Diffstat (limited to 'contrib/llvm/lib/Target/X86')
103 files changed, 113125 insertions, 0 deletions
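The first file in the diff, X86AsmInstrumentation.cpp, rewrites memory-touching instructions in inline assembly so that each access is preceded by an AddressSanitizer shadow check; the long comment at the top of the file walks through the assembly it emits. As a rough sketch, the emitted check is equivalent to the C++ below (the function name is illustrative, the constant is the 64-bit kShadowOffset from the file, the 16-byte case really tests two shadow bytes, and the real code does all of this in spilled registers and calls __asan_report_loadN/storeN instead of returning a bool):

#include <cstdint>

static bool asanShouldReport(uintptr_t Addr, unsigned AccessSize) {
  const uintptr_t kShadowOffset = 0x7fff8000;  // X86AddressSanitizer64::kShadowOffset
  int8_t Shadow = *reinterpret_cast<int8_t *>((Addr >> 3) + kShadowOffset);
  if (Shadow == 0)
    return false;                // the whole 8-byte granule is addressable
  if (AccessSize < 8)            // 1-, 2- and 4-byte accesses take the slow path
    return int8_t((Addr & 7) + AccessSize - 1) >= Shadow;
  return true;                   // 8- and 16-byte accesses: nonzero shadow is a bug
}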
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp new file mode 100644 index 0000000..09cc53a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -0,0 +1,1077 @@ +//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "X86AsmInstrumentation.h" +#include "X86Operand.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Support/CommandLine.h" +#include <algorithm> +#include <cassert> +#include <vector> + +// Following comment describes how assembly instrumentation works. +// Currently we have only AddressSanitizer instrumentation, but we're +// planning to implement MemorySanitizer for inline assembly too. If +// you're not familiar with AddressSanitizer algorithm, please, read +// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm. +// +// When inline assembly is parsed by an instance of X86AsmParser, all +// instructions are emitted via EmitInstruction method. That's the +// place where X86AsmInstrumentation analyzes an instruction and +// decides, whether the instruction should be emitted as is or +// instrumentation is required. The latter case happens when an +// instruction reads from or writes to memory. Now instruction opcode +// is explicitly checked, and if an instruction has a memory operand +// (for instance, movq (%rsi, %rcx, 8), %rax) - it should be +// instrumented. There're also exist instructions that modify +// memory but don't have an explicit memory operands, for instance, +// movs. +// +// Let's consider at first 8-byte memory accesses when an instruction +// has an explicit memory operand. In this case we need two registers - +// AddressReg to compute address of a memory cells which are accessed +// and ShadowReg to compute corresponding shadow address. So, we need +// to spill both registers before instrumentation code and restore them +// after instrumentation. Thus, in general, instrumentation code will +// look like this: +// PUSHF # Store flags, otherwise they will be overwritten +// PUSH AddressReg # spill AddressReg +// PUSH ShadowReg # spill ShadowReg +// LEA MemOp, AddressReg # compute address of the memory operand +// MOV AddressReg, ShadowReg +// SHR ShadowReg, 3 +// # ShadowOffset(AddressReg >> 3) contains address of a shadow +// # corresponding to MemOp. 
+// CMP ShadowOffset(ShadowReg), 0 # test shadow value +// JZ .Done # when shadow equals to zero, everything is fine +// MOV AddressReg, RDI +// # Call __asan_report function with AddressReg as an argument +// CALL __asan_report +// .Done: +// POP ShadowReg # Restore ShadowReg +// POP AddressReg # Restore AddressReg +// POPF # Restore flags +// +// Memory accesses with different size (1-, 2-, 4- and 16-byte) are +// handled in a similar manner, but small memory accesses (less than 8 +// byte) require an additional ScratchReg, which is used for shadow value. +// +// If, suppose, we're instrumenting an instruction like movs, only +// contents of RDI, RDI + AccessSize * RCX, RSI, RSI + AccessSize * +// RCX are checked. In this case there're no need to spill and restore +// AddressReg , ShadowReg or flags four times, they're saved on stack +// just once, before instrumentation of these four addresses, and restored +// at the end of the instrumentation. +// +// There exist several things which complicate this simple algorithm. +// * Instrumented memory operand can have RSP as a base or an index +// register. So we need to add a constant offset before computation +// of memory address, since flags, AddressReg, ShadowReg, etc. were +// already stored on stack and RSP was modified. +// * Debug info (usually, DWARF) should be adjusted, because sometimes +// RSP is used as a frame register. So, we need to select some +// register as a frame register and temprorary override current CFA +// register. + +namespace llvm { +namespace { + +static cl::opt<bool> ClAsanInstrumentAssembly( + "asan-instrument-assembly", + cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden, + cl::init(false)); + +const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min(); +const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max(); + +int64_t ApplyDisplacementBounds(int64_t Displacement) { + return std::max(std::min(MaxAllowedDisplacement, Displacement), + MinAllowedDisplacement); +} + +void CheckDisplacementBounds(int64_t Displacement) { + assert(Displacement >= MinAllowedDisplacement && + Displacement <= MaxAllowedDisplacement); +} + +bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; } + +bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } + +class X86AddressSanitizer : public X86AsmInstrumentation { +public: + struct RegisterContext { + private: + enum RegOffset { + REG_OFFSET_ADDRESS = 0, + REG_OFFSET_SHADOW, + REG_OFFSET_SCRATCH + }; + + public: + RegisterContext(unsigned AddressReg, unsigned ShadowReg, + unsigned ScratchReg) { + BusyRegs.push_back(convReg(AddressReg, 64)); + BusyRegs.push_back(convReg(ShadowReg, 64)); + BusyRegs.push_back(convReg(ScratchReg, 64)); + } + + unsigned AddressReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size); + } + + unsigned ShadowReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SHADOW], Size); + } + + unsigned ScratchReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size); + } + + void AddBusyReg(unsigned Reg) { + if (Reg != X86::NoRegister) + BusyRegs.push_back(convReg(Reg, 64)); + } + + void AddBusyRegs(const X86Operand &Op) { + AddBusyReg(Op.getMemBaseReg()); + AddBusyReg(Op.getMemIndexReg()); + } + + unsigned ChooseFrameReg(unsigned Size) const { + static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX, + X86::RCX, X86::RDX, X86::RDI, + X86::RSI }; + for (unsigned Reg : Candidates) { + if 
(!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) + return convReg(Reg, Size); + } + return X86::NoRegister; + } + + private: + unsigned convReg(unsigned Reg, unsigned Size) const { + return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size); + } + + std::vector<unsigned> BusyRegs; + }; + + X86AddressSanitizer(const MCSubtargetInfo *&STI) + : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} + + ~X86AddressSanitizer() override {} + + // X86AsmInstrumentation implementation: + void InstrumentAndEmitInstruction(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) override { + InstrumentMOVS(Inst, Operands, Ctx, MII, Out); + if (RepPrefix) + EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); + + InstrumentMOV(Inst, Operands, Ctx, MII, Out); + + RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX); + if (!RepPrefix) + EmitInstruction(Out, Inst); + } + + // Adjusts up stack and saves all registers used in instrumentation. + virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) = 0; + + // Restores all registers used in instrumentation and adjusts stack. + virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) = 0; + + virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, MCStreamer &Out) = 0; + virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, MCStreamer &Out) = 0; + + virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) = 0; + + void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, + MCStreamer &Out); + void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg, + unsigned AccessSize, MCContext &Ctx, MCStreamer &Out); + + void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); + void InstrumentMOV(const MCInst &Inst, OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); + +protected: + void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } + + void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) { + assert(Size == 32 || Size == 64); + MCInst Inst; + Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r); + Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size))); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + void ComputeMemOperandAddress(X86Operand &Op, unsigned Size, + unsigned Reg, MCContext &Ctx, MCStreamer &Out); + + // Creates new memory operand with Displacement added to an original + // displacement. Residue will contain a residue which could happen when the + // total displacement exceeds 32-bit limitation. 
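// For example, if Op's constant displacement is 0x7ffffff0 and Displacement
// is 0x100, the returned operand is clamped to INT32_MAX (0x7fffffff) and
// *Residue becomes 0xf1, which ComputeMemOperandAddress folds back in with an
// extra LEA.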
+ std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op, + int64_t Displacement, + MCContext &Ctx, int64_t *Residue); + + bool is64BitMode() const { + return STI->getFeatureBits()[X86::Mode64Bit]; + } + bool is32BitMode() const { + return STI->getFeatureBits()[X86::Mode32Bit]; + } + bool is16BitMode() const { + return STI->getFeatureBits()[X86::Mode16Bit]; + } + + unsigned getPointerWidth() { + if (is16BitMode()) return 16; + if (is32BitMode()) return 32; + if (is64BitMode()) return 64; + llvm_unreachable("invalid mode"); + } + + // True when previous instruction was actually REP prefix. + bool RepPrefix; + + // Offset from the original SP register. + int64_t OrigSPOffset; +}; + +void X86AddressSanitizer::InstrumentMemOperand( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + assert(Op.isMem() && "Op should be a memory operand."); + assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && + "AccessSize should be a power of two, less or equal than 16."); + // FIXME: take into account load/store alignment. + if (IsSmallMemAccess(AccessSize)) + InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); + else + InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, + unsigned CntReg, + unsigned AccessSize, + MCContext &Ctx, MCStreamer &Out) { + // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)] + // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)]. + RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */, + IsSmallMemAccess(AccessSize) + ? X86::RBX + : X86::NoRegister /* ScratchReg */); + RegCtx.AddBusyReg(DstReg); + RegCtx.AddBusyReg(SrcReg); + RegCtx.AddBusyReg(CntReg); + + InstrumentMemOperandPrologue(RegCtx, Ctx, Out); + + // Test (%SrcReg) + { + const MCExpr *Disp = MCConstantExpr::create(0, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, + Out); + } + + // Test -1(%SrcReg, %CntReg, AccessSize) + { + const MCExpr *Disp = MCConstantExpr::create(-1, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), + SMLoc())); + InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, + Out); + } + + // Test (%DstReg) + { + const MCExpr *Disp = MCConstantExpr::create(0, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); + } + + // Test -1(%DstReg, %CntReg, AccessSize) + { + const MCExpr *Disp = MCConstantExpr::create(-1, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(), + SMLoc())); + InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); + } + + InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, + MCStreamer &Out) { + // Access size in bytes. 
+ unsigned AccessSize = 0; + + switch (Inst.getOpcode()) { + case X86::MOVSB: + AccessSize = 1; + break; + case X86::MOVSW: + AccessSize = 2; + break; + case X86::MOVSL: + AccessSize = 4; + break; + case X86::MOVSQ: + AccessSize = 8; + break; + default: + return; + } + + InstrumentMOVSImpl(AccessSize, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, + OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) { + // Access size in bytes. + unsigned AccessSize = 0; + + switch (Inst.getOpcode()) { + case X86::MOV8mi: + case X86::MOV8mr: + case X86::MOV8rm: + AccessSize = 1; + break; + case X86::MOV16mi: + case X86::MOV16mr: + case X86::MOV16rm: + AccessSize = 2; + break; + case X86::MOV32mi: + case X86::MOV32mr: + case X86::MOV32rm: + AccessSize = 4; + break; + case X86::MOV64mi32: + case X86::MOV64mr: + case X86::MOV64rm: + AccessSize = 8; + break; + case X86::MOVAPDmr: + case X86::MOVAPSmr: + case X86::MOVAPDrm: + case X86::MOVAPSrm: + AccessSize = 16; + break; + default: + return; + } + + const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); + + for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { + assert(Operands[Ix]); + MCParsedAsmOperand &Op = *Operands[Ix]; + if (Op.isMem()) { + X86Operand &MemOp = static_cast<X86Operand &>(Op); + RegisterContext RegCtx( + X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */, + IsSmallMemAccess(AccessSize) ? X86::RCX + : X86::NoRegister /* ScratchReg */); + RegCtx.AddBusyRegs(MemOp); + InstrumentMemOperandPrologue(RegCtx, Ctx, Out); + InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out); + InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); + } + } +} + +void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, + unsigned Size, + unsigned Reg, MCContext &Ctx, + MCStreamer &Out) { + int64_t Displacement = 0; + if (IsStackReg(Op.getMemBaseReg())) + Displacement -= OrigSPOffset; + if (IsStackReg(Op.getMemIndexReg())) + Displacement -= OrigSPOffset * Op.getMemScale(); + + assert(Displacement >= 0); + + // Emit Op as is. 
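// (Displacement is nonzero only when Op uses %esp/%rsp as a base or index
// register; it compensates for the bytes pushed above, e.g. with
// OrigSPOffset == -12 the operand 8(%esp) ends up addressed as 20(%esp).)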
+ if (Displacement == 0) { + EmitLEA(Op, Size, Reg, Out); + return; + } + + int64_t Residue; + std::unique_ptr<X86Operand> NewOp = + AddDisplacement(Op, Displacement, Ctx, &Residue); + EmitLEA(*NewOp, Size, Reg, Out); + + while (Residue != 0) { + const MCConstantExpr *Disp = + MCConstantExpr::create(ApplyDisplacementBounds(Residue), Ctx); + std::unique_ptr<X86Operand> DispOp = + X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), + SMLoc()); + EmitLEA(*DispOp, Size, Reg, Out); + Residue -= Disp->getValue(); + } +} + +std::unique_ptr<X86Operand> +X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement, + MCContext &Ctx, int64_t *Residue) { + assert(Displacement >= 0); + + if (Displacement == 0 || + (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) { + *Residue = Displacement; + return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), + Op.getMemDisp(), Op.getMemBaseReg(), + Op.getMemIndexReg(), Op.getMemScale(), + SMLoc(), SMLoc()); + } + + int64_t OrigDisplacement = + static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue(); + CheckDisplacementBounds(OrigDisplacement); + Displacement += OrigDisplacement; + + int64_t NewDisplacement = ApplyDisplacementBounds(Displacement); + CheckDisplacementBounds(NewDisplacement); + + *Residue = Displacement - NewDisplacement; + const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx); + return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp, + Op.getMemBaseReg(), Op.getMemIndexReg(), + Op.getMemScale(), SMLoc(), SMLoc()); +} + +class X86AddressSanitizer32 : public X86AddressSanitizer { +public: + static const long kShadowOffset = 0x20000000; + + X86AddressSanitizer32(const MCSubtargetInfo *&STI) + : X86AddressSanitizer(STI) {} + + ~X86AddressSanitizer32() override {} + + unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { + unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); + if (FrameReg == X86::NoRegister) + return FrameReg; + return getX86SubSuperRegister(FrameReg, 32); + } + + void SpillReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg)); + OrigSPOffset -= 4; + } + + void RestoreReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg)); + OrigSPOffset += 4; + } + + void StoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + OrigSPOffset -= 4; + } + + void RestoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + OrigSPOffset += 4; + } + + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); + assert(LocalFrameReg != X86::NoRegister); + + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (MRI && FrameReg != X86::NoRegister) { + SpillReg(Out, LocalFrameReg); + if (FrameReg == X86::ESP) { + Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */); + Out.EmitCFIRelOffset( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); + } + EmitInstruction( + Out, + MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg)); + Out.EmitCFIRememberState(); + Out.EmitCFIDefCfaRegister( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); + } + + SpillReg(Out, RegCtx.AddressReg(32)); + SpillReg(Out, RegCtx.ShadowReg(32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(32)); + 
StoreFlags(Out); + } + + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); + assert(LocalFrameReg != X86::NoRegister); + + RestoreFlags(Out); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(32)); + RestoreReg(Out, RegCtx.ShadowReg(32)); + RestoreReg(Out, RegCtx.AddressReg(32)); + + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { + RestoreReg(Out, LocalFrameReg); + Out.EmitCFIRestoreState(); + if (FrameReg == X86::ESP) + Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */); + } + } + + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; + +private: + void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out, const RegisterContext &RegCtx) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) + .addReg(X86::ESP) + .addReg(X86::ESP) + .addImm(-16)); + EmitInstruction( + Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32))); + + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); + } +}; + +void X86AddressSanitizer32::InstrumentMemOperandSmall( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); + + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); + + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) + .addReg(ShadowRegI32) + .addReg(ShadowRegI32) + .addImm(3)); + + { + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::createReg(ShadowRegI8)); + const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, + SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction( + Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(7)); + + switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); + case 1: + 
break; + case 2: { + const MCExpr *Disp = MCConstantExpr::create(1, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, + SMLoc(), SMLoc())); + EmitLEA(*Op, 32, ScratchRegI32, Out); + break; + } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(3)); + break; + } + + EmitInstruction( + Out, + MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); + EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( + ShadowRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); + + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); + EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer32::InstrumentMemOperandLarge( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) + .addReg(ShadowRegI32) + .addReg(ShadowRegI32) + .addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + } + const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, + SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::createImm(0)); + EmitInstruction(Out, Inst); + } + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); + EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize, + MCContext &Ctx, + MCStreamer &Out) { + StoreFlags(Out); + + // No need to test when ECX is equals to zero. + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction( + Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + // Instrument first and last elements in src and dst range. 
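// (InstrumentMOVSBase checks (%esi), -1(%esi,%ecx,AccessSize), (%edi) and
// -1(%edi,%ecx,AccessSize), i.e. the first byte of the first element and the
// last byte of the last element of each range.)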
+ InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */, + X86::ECX /* CntReg */, AccessSize, Ctx, Out); + + EmitLabel(Out, DoneSym); + RestoreFlags(Out); +} + +class X86AddressSanitizer64 : public X86AddressSanitizer { +public: + static const long kShadowOffset = 0x7fff8000; + + X86AddressSanitizer64(const MCSubtargetInfo *&STI) + : X86AddressSanitizer(STI) {} + + ~X86AddressSanitizer64() override {} + + unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { + unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); + if (FrameReg == X86::NoRegister) + return FrameReg; + return getX86SubSuperRegister(FrameReg, 64); + } + + void SpillReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg)); + OrigSPOffset -= 8; + } + + void RestoreReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg)); + OrigSPOffset += 8; + } + + void StoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); + OrigSPOffset -= 8; + } + + void RestoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); + OrigSPOffset += 8; + } + + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); + assert(LocalFrameReg != X86::NoRegister); + + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (MRI && FrameReg != X86::NoRegister) { + SpillReg(Out, X86::RBP); + if (FrameReg == X86::RSP) { + Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */); + Out.EmitCFIRelOffset( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); + } + EmitInstruction( + Out, + MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg)); + Out.EmitCFIRememberState(); + Out.EmitCFIDefCfaRegister( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); + } + + EmitAdjustRSP(Ctx, Out, -128); + SpillReg(Out, RegCtx.ShadowReg(64)); + SpillReg(Out, RegCtx.AddressReg(64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(64)); + StoreFlags(Out); + } + + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); + assert(LocalFrameReg != X86::NoRegister); + + RestoreFlags(Out); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(64)); + RestoreReg(Out, RegCtx.AddressReg(64)); + RestoreReg(Out, RegCtx.ShadowReg(64)); + EmitAdjustRSP(Ctx, Out, 128); + + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { + RestoreReg(Out, LocalFrameReg); + Out.EmitCFIRestoreState(); + if (FrameReg == X86::RSP) + Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */); + } + } + + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; + +private: + void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { + const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 
0, 1, + SMLoc(), SMLoc())); + EmitLEA(*Op, 64, X86::RSP, Out); + OrigSPOffset += Offset; + } + + void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out, const RegisterContext &RegCtx) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) + .addReg(X86::RSP) + .addReg(X86::RSP) + .addImm(-16)); + + if (RegCtx.AddressReg(64) != X86::RDI) { + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( + RegCtx.AddressReg(64))); + } + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); + } +}; + +void X86AddressSanitizer64::InstrumentMemOperandSmall( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); + + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); + + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( + AddressRegI64)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) + .addReg(ShadowRegI64) + .addReg(ShadowRegI64) + .addImm(3)); + { + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::createReg(ShadowRegI8)); + const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, + SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction( + Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(7)); + + switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); + case 1: + break; + case 2: { + const MCExpr *Disp = MCConstantExpr::create(1, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, + SMLoc(), SMLoc())); + EmitLEA(*Op, 32, ScratchRegI32, Out); + break; + } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(3)); + break; + } + + EmitInstruction( + Out, + MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); + EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( + ShadowRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); + + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); + EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer64::InstrumentMemOperandLarge( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const 
RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); + + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( + AddressRegI64)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) + .addReg(ShadowRegI64) + .addReg(ShadowRegI64) + .addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + } + const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, + SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::createImm(0)); + EmitInstruction(Out, Inst); + } + + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); + EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, + MCContext &Ctx, + MCStreamer &Out) { + StoreFlags(Out); + + // No need to test when RCX is equals to zero. + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction( + Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + // Instrument first and last elements in src and dst range. + InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */, + X86::RCX /* CntReg */, AccessSize, Ctx, Out); + + EmitLabel(Out, DoneSym); + RestoreFlags(Out); +} + +} // End anonymous namespace + +X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI) + : STI(STI), InitialFrameReg(0) {} + +X86AsmInstrumentation::~X86AsmInstrumentation() {} + +void X86AsmInstrumentation::InstrumentAndEmitInstruction( + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) { + EmitInstruction(Out, Inst); +} + +void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, + const MCInst &Inst) { + Out.EmitInstruction(Inst, *STI); +} + +unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, + MCStreamer &Out) { + if (!Out.getNumFrameInfos()) // No active dwarf frame + return X86::NoRegister; + const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back(); + if (Frame.End) // Active dwarf frame is closed + return X86::NoRegister; + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + if (!MRI) // No register info + return X86::NoRegister; + + if (InitialFrameReg) { + // FrameReg is set explicitly, we're instrumenting a MachineFunction. 
+ return InitialFrameReg; + } + + return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */); +} + +X86AsmInstrumentation * +CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, const MCSubtargetInfo *&STI) { + Triple T(STI->getTargetTriple()); + const bool hasCompilerRTSupport = T.isOSLinux(); + if (ClAsanInstrumentAssembly && hasCompilerRTSupport && + MCOptions.SanitizeAddress) { + if (STI->getFeatureBits()[X86::Mode32Bit] != 0) + return new X86AddressSanitizer32(STI); + if (STI->getFeatureBits()[X86::Mode64Bit] != 0) + return new X86AddressSanitizer64(STI); + } + return new X86AsmInstrumentation(STI); +} + +} // end llvm namespace diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h new file mode 100644 index 0000000..470cead --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -0,0 +1,68 @@ +//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H + +#include "llvm/ADT/SmallVector.h" + +#include <memory> + +namespace llvm { + +class MCContext; +class MCInst; +class MCInstrInfo; +class MCParsedAsmOperand; +class MCStreamer; +class MCSubtargetInfo; +class MCTargetOptions; + +class X86AsmInstrumentation; + +X86AsmInstrumentation * +CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, + const MCSubtargetInfo *&STI); + +class X86AsmInstrumentation { +public: + virtual ~X86AsmInstrumentation(); + + // Sets frame register corresponding to a current frame. + void SetInitialFrameRegister(unsigned RegNo) { + InitialFrameReg = RegNo; + } + + // Tries to instrument and emit instruction. + virtual void InstrumentAndEmitInstruction( + const MCInst &Inst, + SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); + +protected: + friend X86AsmInstrumentation * + CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, + const MCSubtargetInfo *&STI); + + X86AsmInstrumentation(const MCSubtargetInfo *&STI); + + unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out); + + void EmitInstruction(MCStreamer &Out, const MCInst &Inst); + + const MCSubtargetInfo *&STI; + + unsigned InitialFrameReg; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp new file mode 100644 index 0000000..4d8ffac --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -0,0 +1,2951 @@ +//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "X86AsmInstrumentation.h" +#include "X86AsmParserCommon.h" +#include "X86Operand.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <memory> + +using namespace llvm; + +namespace { + +static const char OpPrecedence[] = { + 0, // IC_OR + 1, // IC_XOR + 2, // IC_AND + 3, // IC_LSHIFT + 3, // IC_RSHIFT + 4, // IC_PLUS + 4, // IC_MINUS + 5, // IC_MULTIPLY + 5, // IC_DIVIDE + 6, // IC_RPAREN + 7, // IC_LPAREN + 0, // IC_IMM + 0 // IC_REGISTER +}; + +class X86AsmParser : public MCTargetAsmParser { + const MCInstrInfo &MII; + ParseInstructionInfo *InstInfo; + std::unique_ptr<X86AsmInstrumentation> Instrumentation; + +private: + SMLoc consumeToken() { + MCAsmParser &Parser = getParser(); + SMLoc Result = Parser.getTok().getLoc(); + Parser.Lex(); + return Result; + } + + enum InfixCalculatorTok { + IC_OR = 0, + IC_XOR, + IC_AND, + IC_LSHIFT, + IC_RSHIFT, + IC_PLUS, + IC_MINUS, + IC_MULTIPLY, + IC_DIVIDE, + IC_RPAREN, + IC_LPAREN, + IC_IMM, + IC_REGISTER + }; + + class InfixCalculator { + typedef std::pair< InfixCalculatorTok, int64_t > ICToken; + SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; + SmallVector<ICToken, 4> PostfixStack; + + public: + int64_t popOperand() { + assert (!PostfixStack.empty() && "Poped an empty stack!"); + ICToken Op = PostfixStack.pop_back_val(); + assert ((Op.first == IC_IMM || Op.first == IC_REGISTER) + && "Expected and immediate or register!"); + return Op.second; + } + void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) { + assert ((Op == IC_IMM || Op == IC_REGISTER) && + "Unexpected operand!"); + PostfixStack.push_back(std::make_pair(Op, Val)); + } + + void popOperator() { InfixOperatorStack.pop_back(); } + void pushOperator(InfixCalculatorTok Op) { + // Push the new operator if the stack is empty. + if (InfixOperatorStack.empty()) { + InfixOperatorStack.push_back(Op); + return; + } + + // Push the new operator if it has a higher precedence than the operator + // on the top of the stack or the operator on the top of the stack is a + // left parentheses. + unsigned Idx = InfixOperatorStack.size() - 1; + InfixCalculatorTok StackOp = InfixOperatorStack[Idx]; + if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) { + InfixOperatorStack.push_back(Op); + return; + } + + // The operator on the top of the stack has higher precedence than the + // new operator. + unsigned ParenCount = 0; + while (1) { + // Nothing to process. 
+ if (InfixOperatorStack.empty()) + break; + + Idx = InfixOperatorStack.size() - 1; + StackOp = InfixOperatorStack[Idx]; + if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount)) + break; + + // If we have an even parentheses count and we see a left parentheses, + // then stop processing. + if (!ParenCount && StackOp == IC_LPAREN) + break; + + if (StackOp == IC_RPAREN) { + ++ParenCount; + InfixOperatorStack.pop_back(); + } else if (StackOp == IC_LPAREN) { + --ParenCount; + InfixOperatorStack.pop_back(); + } else { + InfixOperatorStack.pop_back(); + PostfixStack.push_back(std::make_pair(StackOp, 0)); + } + } + // Push the new operator. + InfixOperatorStack.push_back(Op); + } + + int64_t execute() { + // Push any remaining operators onto the postfix stack. + while (!InfixOperatorStack.empty()) { + InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val(); + if (StackOp != IC_LPAREN && StackOp != IC_RPAREN) + PostfixStack.push_back(std::make_pair(StackOp, 0)); + } + + if (PostfixStack.empty()) + return 0; + + SmallVector<ICToken, 16> OperandStack; + for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) { + ICToken Op = PostfixStack[i]; + if (Op.first == IC_IMM || Op.first == IC_REGISTER) { + OperandStack.push_back(Op); + } else { + assert (OperandStack.size() > 1 && "Too few operands."); + int64_t Val; + ICToken Op2 = OperandStack.pop_back_val(); + ICToken Op1 = OperandStack.pop_back_val(); + switch (Op.first) { + default: + report_fatal_error("Unexpected operator!"); + break; + case IC_PLUS: + Val = Op1.second + Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_MINUS: + Val = Op1.second - Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_MULTIPLY: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Multiply operation with an immediate and a register!"); + Val = Op1.second * Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_DIVIDE: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Divide operation with an immediate and a register!"); + assert (Op2.second != 0 && "Division by zero!"); + Val = Op1.second / Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_OR: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Or operation with an immediate and a register!"); + Val = Op1.second | Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_XOR: + assert(Op1.first == IC_IMM && Op2.first == IC_IMM && + "Xor operation with an immediate and a register!"); + Val = Op1.second ^ Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_AND: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "And operation with an immediate and a register!"); + Val = Op1.second & Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_LSHIFT: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Left shift operation with an immediate and a register!"); + Val = Op1.second << Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_RSHIFT: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Right shift operation with an immediate and a register!"); + Val = Op1.second >> Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + } + } + } + assert (OperandStack.size() == 1 && "Expected a single result."); + return OperandStack.pop_back_val().second; + } + }; + + enum IntelExprState { + IES_OR, 
+ IES_XOR, + IES_AND, + IES_LSHIFT, + IES_RSHIFT, + IES_PLUS, + IES_MINUS, + IES_NOT, + IES_MULTIPLY, + IES_DIVIDE, + IES_LBRAC, + IES_RBRAC, + IES_LPAREN, + IES_RPAREN, + IES_REGISTER, + IES_INTEGER, + IES_IDENTIFIER, + IES_ERROR + }; + + class IntelExprStateMachine { + IntelExprState State, PrevState; + unsigned BaseReg, IndexReg, TmpReg, Scale; + int64_t Imm; + const MCExpr *Sym; + StringRef SymName; + bool StopOnLBrac, AddImmPrefix; + InfixCalculator IC; + InlineAsmIdentifierInfo Info; + + public: + IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : + State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), + Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac), + AddImmPrefix(addimmprefix) { Info.clear(); } + + unsigned getBaseReg() { return BaseReg; } + unsigned getIndexReg() { return IndexReg; } + unsigned getScale() { return Scale; } + const MCExpr *getSym() { return Sym; } + StringRef getSymName() { return SymName; } + int64_t getImm() { return Imm + IC.execute(); } + bool isValidEndState() { + return State == IES_RBRAC || State == IES_INTEGER; + } + bool getStopOnLBrac() { return StopOnLBrac; } + bool getAddImmPrefix() { return AddImmPrefix; } + bool hadError() { return State == IES_ERROR; } + + InlineAsmIdentifierInfo &getIdentifierInfo() { + return Info; + } + + void onOr() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_OR; + IC.pushOperator(IC_OR); + break; + } + PrevState = CurrState; + } + void onXor() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_XOR; + IC.pushOperator(IC_XOR); + break; + } + PrevState = CurrState; + } + void onAnd() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_AND; + IC.pushOperator(IC_AND); + break; + } + PrevState = CurrState; + } + void onLShift() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_LSHIFT; + IC.pushOperator(IC_LSHIFT); + break; + } + PrevState = CurrState; + } + void onRShift() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_RSHIFT; + IC.pushOperator(IC_RSHIFT); + break; + } + PrevState = CurrState; + } + void onPlus() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_PLUS; + IC.pushOperator(IC_PLUS); + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. 
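// (e.g. in "[ebx + esi + 4]" ebx has already been recorded as the base, so
// esi becomes the index here)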
+ if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onMinus() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_NOT: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_LPAREN: + case IES_RPAREN: + case IES_LBRAC: + case IES_RBRAC: + case IES_INTEGER: + case IES_REGISTER: + State = IES_MINUS; + // Only push the minus operator if it is not a unary operator. + if (!(CurrState == IES_PLUS || CurrState == IES_MINUS || + CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE || + CurrState == IES_LPAREN || CurrState == IES_LBRAC)) + IC.pushOperator(IC_MINUS); + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onNot() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_NOT: + State = IES_NOT; + break; + } + PrevState = CurrState; + } + void onRegister(unsigned Reg) { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_LPAREN: + State = IES_REGISTER; + TmpReg = Reg; + IC.pushOperand(IC_REGISTER); + break; + case IES_MULTIPLY: + // Index Register - Scale * Register + if (PrevState == IES_INTEGER) { + assert (!IndexReg && "IndexReg already set!"); + State = IES_REGISTER; + IndexReg = Reg; + // Get the scale and replace the 'Scale * Register' with '0'. + Scale = IC.popOperand(); + IC.pushOperand(IC_IMM); + IC.popOperator(); + } else { + State = IES_ERROR; + } + break; + } + PrevState = CurrState; + } + void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_NOT: + State = IES_INTEGER; + Sym = SymRef; + SymName = SymRefName; + IC.pushOperand(IC_IMM); + break; + } + } + bool onInteger(int64_t TmpInt, StringRef &ErrMsg) { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_NOT: + case IES_OR: + case IES_XOR: + case IES_AND: + case IES_LSHIFT: + case IES_RSHIFT: + case IES_DIVIDE: + case IES_MULTIPLY: + case IES_LPAREN: + State = IES_INTEGER; + if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) { + // Index Register - Register * Scale + assert (!IndexReg && "IndexReg already set!"); + IndexReg = TmpReg; + Scale = TmpInt; + if(Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) { + ErrMsg = "scale factor in address must be 1, 2, 4 or 8"; + return true; + } + // Get the scale and replace the 'Register * Scale' with '0'. + IC.popOperator(); + } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_OR || PrevState == IES_AND || + PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT || PrevState == IES_XOR) && + CurrState == IES_MINUS) { + // Unary minus. No need to pop the minus operand because it was never + // pushed. 
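// (e.g. a displacement written as "-4" reaches this point as the single
// immediate -4 rather than as a subtraction)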
+ IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm. + } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_OR || PrevState == IES_AND || + PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT || PrevState == IES_XOR) && + CurrState == IES_NOT) { + // Unary not. No need to pop the not operand because it was never + // pushed. + IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm. + } else { + IC.pushOperand(IC_IMM, TmpInt); + } + break; + } + PrevState = CurrState; + return false; + } + void onStar() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_MULTIPLY; + IC.pushOperator(IC_MULTIPLY); + break; + } + } + void onDivide() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + State = IES_DIVIDE; + IC.pushOperator(IC_DIVIDE); + break; + } + } + void onLBrac() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_RBRAC: + State = IES_PLUS; + IC.pushOperator(IC_PLUS); + break; + } + } + void onRBrac() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_RBRAC; + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onLParen() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_NOT: + case IES_OR: + case IES_XOR: + case IES_AND: + case IES_LSHIFT: + case IES_RSHIFT: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_LPAREN: + // FIXME: We don't handle this type of unary minus or not, yet. 
+ if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_OR || PrevState == IES_AND || + PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT || PrevState == IES_XOR) && + (CurrState == IES_MINUS || CurrState == IES_NOT)) { + State = IES_ERROR; + break; + } + State = IES_LPAREN; + IC.pushOperator(IC_LPAREN); + break; + } + PrevState = CurrState; + } + void onRParen() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_RPAREN; + IC.pushOperator(IC_RPAREN); + break; + } + } + }; + + bool Error(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = None, + bool MatchingInlineAsm = false) { + MCAsmParser &Parser = getParser(); + if (MatchingInlineAsm) return true; + return Parser.Error(L, Msg, Ranges); + } + + bool ErrorAndEatStatement(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = None, + bool MatchingInlineAsm = false) { + MCAsmParser &Parser = getParser(); + Parser.eatToEndOfStatement(); + return Error(L, Msg, Ranges, MatchingInlineAsm); + } + + std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) { + Error(Loc, Msg); + return nullptr; + } + + std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc); + std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc); + void AddDefaultSrcDestOperands( + OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src, + std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst); + std::unique_ptr<X86Operand> ParseOperand(); + std::unique_ptr<X86Operand> ParseATTOperand(); + std::unique_ptr<X86Operand> ParseIntelOperand(); + std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator(); + bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); + std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind); + std::unique_ptr<X86Operand> + ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); + std::unique_ptr<X86Operand> + ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size); + std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End); + bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); + std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg, + SMLoc Start, + int64_t ImmDisp, + unsigned Size); + bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End); + + std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + + std::unique_ptr<X86Operand> + CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, + unsigned IndexReg, unsigned Scale, SMLoc Start, + SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info); + + bool parseDirectiveEven(SMLoc L); + bool ParseDirectiveWord(unsigned Size, SMLoc L); + bool ParseDirectiveCode(StringRef IDVal, SMLoc L); + + bool processInstruction(MCInst &Inst, const OperandVector &Ops); + + /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds + /// instrumentation around Inst. 
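/// (The work is delegated to the X86AsmInstrumentation object created in the
/// constructor below.)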
+ void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out); + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + + void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands, + MCStreamer &Out, bool MatchingInlineAsm); + + bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool MatchingInlineAsm); + + bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm); + + bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm); + + bool OmitRegisterFromClobberLists(unsigned RegNo) override; + + /// doSrcDstMatch - Returns true if operands are matching in their + /// word size (%si and %di, %esi and %edi, etc.). Order depends on + /// the parsing mode (Intel vs. AT&T). + bool doSrcDstMatch(X86Operand &Op1, X86Operand &Op2); + + /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z}) + /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required. + /// \return \c true if no parsing errors occurred, \c false otherwise. + bool HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op); + + bool is64BitMode() const { + // FIXME: Can tablegen auto-generate this? + return getSTI().getFeatureBits()[X86::Mode64Bit]; + } + bool is32BitMode() const { + // FIXME: Can tablegen auto-generate this? + return getSTI().getFeatureBits()[X86::Mode32Bit]; + } + bool is16BitMode() const { + // FIXME: Can tablegen auto-generate this? + return getSTI().getFeatureBits()[X86::Mode16Bit]; + } + void SwitchMode(unsigned mode) { + MCSubtargetInfo &STI = copySTI(); + FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); + FeatureBitset OldMode = STI.getFeatureBits() & AllModes; + unsigned FB = ComputeAvailableFeatures( + STI.ToggleFeature(OldMode.flip(mode))); + setAvailableFeatures(FB); + + assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes)); + } + + unsigned getPointerWidth() { + if (is16BitMode()) return 16; + if (is32BitMode()) return 32; + if (is64BitMode()) return 64; + llvm_unreachable("invalid mode"); + } + + bool isParsingIntelSyntax() { + return getParser().getAssemblerDialect(); + } + + /// @name Auto-generated Matcher Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "X86GenAsmMatcher.inc" + + /// } + +public: + X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, + const MCInstrInfo &mii, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) { + + // Initialize the set of available features. 
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + Instrumentation.reset( + CreateX86AsmInstrumentation(Options, Parser.getContext(), STI)); + } + + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + + void SetFrameRegister(unsigned RegNo) override; + + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + bool ParseDirective(AsmToken DirectiveID) override; +}; +} // end anonymous namespace + +/// @name Auto-generated Match Functions +/// { + +static unsigned MatchRegisterName(StringRef Name); + +/// } + +static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg, + StringRef &ErrMsg) { + // If we have both a base register and an index register make sure they are + // both 64-bit or 32-bit registers. + // To support VSIB, IndexReg can be 128-bit or 256-bit registers. + if (BaseReg != 0 && IndexReg != 0) { + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) && + IndexReg != X86::RIZ) { + ErrMsg = "base register is 64-bit, but index register is not"; + return true; + } + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) && + IndexReg != X86::EIZ){ + ErrMsg = "base register is 32-bit, but index register is not"; + return true; + } + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) { + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) { + ErrMsg = "base register is 16-bit, but index register is not"; + return true; + } + if (((BaseReg == X86::BX || BaseReg == X86::BP) && + IndexReg != X86::SI && IndexReg != X86::DI) || + ((BaseReg == X86::SI || BaseReg == X86::DI) && + IndexReg != X86::BX && IndexReg != X86::BP)) { + ErrMsg = "invalid 16-bit base/index register combination"; + return true; + } + } + } + return false; +} + +bool X86AsmParser::doSrcDstMatch(X86Operand &Op1, X86Operand &Op2) +{ + // Return true and let a normal complaint about bogus operands happen. + if (!Op1.isMem() || !Op2.isMem()) + return true; + + // Actually these might be the other way round if Intel syntax is + // being used. It doesn't matter. + unsigned diReg = Op1.Mem.BaseReg; + unsigned siReg = Op2.Mem.BaseReg; + + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(siReg)) + return X86MCRegisterClasses[X86::GR16RegClassID].contains(diReg); + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(siReg)) + return X86MCRegisterClasses[X86::GR32RegClassID].contains(diReg); + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(siReg)) + return X86MCRegisterClasses[X86::GR64RegClassID].contains(diReg); + // Again, return true and let another error happen. + return true; +} + +bool X86AsmParser::ParseRegister(unsigned &RegNo, + SMLoc &StartLoc, SMLoc &EndLoc) { + MCAsmParser &Parser = getParser(); + RegNo = 0; + const AsmToken &PercentTok = Parser.getTok(); + StartLoc = PercentTok.getLoc(); + + // If we encounter a %, ignore it. This code handles registers with and + // without the prefix, unprefixed registers can occur in cfi directives. + if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) + Parser.Lex(); // Eat percent token. 
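+  // From here on the register name itself is the current token. It may have
+  // been written with or without the '%' prefix, and uppercase spellings are
+  // handled by retrying the match with a lowercased copy.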
+ + const AsmToken &Tok = Parser.getTok(); + EndLoc = Tok.getEndLoc(); + + if (Tok.isNot(AsmToken::Identifier)) { + if (isParsingIntelSyntax()) return true; + return Error(StartLoc, "invalid register name", + SMRange(StartLoc, EndLoc)); + } + + RegNo = MatchRegisterName(Tok.getString()); + + // If the match failed, try the register name as lowercase. + if (RegNo == 0) + RegNo = MatchRegisterName(Tok.getString().lower()); + + // The "flags" register cannot be referenced directly. + // Treat it as an identifier instead. + if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS) + RegNo = 0; + + if (!is64BitMode()) { + // FIXME: This should be done using Requires<Not64BitMode> and + // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also + // checked. + // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a + // REX prefix. + if (RegNo == X86::RIZ || + X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) || + X86II::isX86_64NonExtLowByteReg(RegNo) || + X86II::isX86_64ExtendedReg(RegNo)) + return Error(StartLoc, "register %" + + Tok.getString() + " is only available in 64-bit mode", + SMRange(StartLoc, EndLoc)); + } + + // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens. + if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) { + RegNo = X86::ST0; + Parser.Lex(); // Eat 'st' + + // Check to see if we have '(4)' after %st. + if (getLexer().isNot(AsmToken::LParen)) + return false; + // Lex the paren. + getParser().Lex(); + + const AsmToken &IntTok = Parser.getTok(); + if (IntTok.isNot(AsmToken::Integer)) + return Error(IntTok.getLoc(), "expected stack index"); + switch (IntTok.getIntVal()) { + case 0: RegNo = X86::ST0; break; + case 1: RegNo = X86::ST1; break; + case 2: RegNo = X86::ST2; break; + case 3: RegNo = X86::ST3; break; + case 4: RegNo = X86::ST4; break; + case 5: RegNo = X86::ST5; break; + case 6: RegNo = X86::ST6; break; + case 7: RegNo = X86::ST7; break; + default: return Error(IntTok.getLoc(), "invalid stack index"); + } + + if (getParser().Lex().isNot(AsmToken::RParen)) + return Error(Parser.getTok().getLoc(), "expected ')'"); + + EndLoc = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat ')' + return false; + } + + EndLoc = Parser.getTok().getEndLoc(); + + // If this is "db[0-7]", match it as an alias + // for dr[0-7]. + if (RegNo == 0 && Tok.getString().size() == 3 && + Tok.getString().startswith("db")) { + switch (Tok.getString()[2]) { + case '0': RegNo = X86::DR0; break; + case '1': RegNo = X86::DR1; break; + case '2': RegNo = X86::DR2; break; + case '3': RegNo = X86::DR3; break; + case '4': RegNo = X86::DR4; break; + case '5': RegNo = X86::DR5; break; + case '6': RegNo = X86::DR6; break; + case '7': RegNo = X86::DR7; break; + } + + if (RegNo != 0) { + EndLoc = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat it. + return false; + } + } + + if (RegNo == 0) { + if (isParsingIntelSyntax()) return true; + return Error(StartLoc, "invalid register name", + SMRange(StartLoc, EndLoc)); + } + + Parser.Lex(); // Eat identifier token. + return false; +} + +void X86AsmParser::SetFrameRegister(unsigned RegNo) { + Instrumentation->SetInitialFrameRegister(RegNo); +} + +std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { + unsigned basereg = + is64BitMode() ? X86::RSI : (is32BitMode() ? 
X86::ESI : X86::SI); + const MCExpr *Disp = MCConstantExpr::create(0, getContext()); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + Loc, Loc, 0); +} + +std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { + unsigned basereg = + is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI); + const MCExpr *Disp = MCConstantExpr::create(0, getContext()); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + Loc, Loc, 0); +} + +void X86AsmParser::AddDefaultSrcDestOperands( + OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src, + std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) { + if (isParsingIntelSyntax()) { + Operands.push_back(std::move(Dst)); + Operands.push_back(std::move(Src)); + } + else { + Operands.push_back(std::move(Src)); + Operands.push_back(std::move(Dst)); + } +} + +std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { + if (isParsingIntelSyntax()) + return ParseIntelOperand(); + return ParseATTOperand(); +} + +/// getIntelMemOperandSize - Return intel memory operand size. +static unsigned getIntelMemOperandSize(StringRef OpStr) { + unsigned Size = StringSwitch<unsigned>(OpStr) + .Cases("BYTE", "byte", 8) + .Cases("WORD", "word", 16) + .Cases("DWORD", "dword", 32) + .Cases("FWORD", "fword", 48) + .Cases("QWORD", "qword", 64) + .Cases("MMWORD","mmword", 64) + .Cases("XWORD", "xword", 80) + .Cases("TBYTE", "tbyte", 80) + .Cases("XMMWORD", "xmmword", 128) + .Cases("YMMWORD", "ymmword", 256) + .Cases("ZMMWORD", "zmmword", 512) + .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter + .Default(0); + return Size; +} + +std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( + unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info) { + // If we found a decl other than a VarDecl, then assume it is a FuncDecl or + // some other label reference. + if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) { + // Insert an explicit size if the user didn't have one. + if (!Size) { + Size = getPointerWidth(); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); + } + + // Create an absolute memory reference in order to match against + // instructions taking a PC relative operand. + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size, + Identifier, Info.OpDecl); + } + + // We either have a direct symbol reference, or an offset from a symbol. The + // parser always puts the symbol on the LHS, so look there for size + // calculation purposes. + const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp); + bool IsSymRef = + isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp); + if (IsSymRef) { + if (!Size) { + Size = Info.Type * 8; // Size is in terms of bits in this context. + if (Size) + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); + } + } + + // When parsing inline assembly we set the base register to a non-zero value + // if we don't know the actual value at this time. This is necessary to + // get the matching correct in some cases. + BaseReg = BaseReg ? 
BaseReg : 1; + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, Start, End, Size, Identifier, + Info.OpDecl); +} + +static void +RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites, + StringRef SymName, int64_t ImmDisp, + int64_t FinalImmDisp, SMLoc &BracLoc, + SMLoc &StartInBrac, SMLoc &End) { + // Remove the '[' and ']' from the IR string. + AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1); + AsmRewrites.emplace_back(AOK_Skip, End, 1); + + // If ImmDisp is non-zero, then we parsed a displacement before the + // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp]) + // If ImmDisp doesn't match the displacement computed by the state machine + // then we have an additional displacement in the bracketed expression. + if (ImmDisp != FinalImmDisp) { + if (ImmDisp) { + // We have an immediate displacement before the bracketed expression. + // Adjust this to match the final immediate displacement. + bool Found = false; + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() > BracLoc.getPointer()) + continue; + if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) { + assert (!Found && "ImmDisp already rewritten."); + AR.Kind = AOK_Imm; + AR.Len = BracLoc.getPointer() - AR.Loc.getPointer(); + AR.Val = FinalImmDisp; + Found = true; + break; + } + } + assert (Found && "Unable to rewrite ImmDisp."); + (void)Found; + } else { + // We have a symbolic and an immediate displacement, but no displacement + // before the bracketed expression. Put the immediate displacement + // before the bracketed expression. + AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp); + } + } + // Remove all the ImmPrefix rewrites within the brackets. + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() < StartInBrac.getPointer()) + continue; + if (AR.Kind == AOK_ImmPrefix) + AR.Kind = AOK_Delete; + } + const char *SymLocPtr = SymName.data(); + // Skip everything before the symbol. + if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { + assert(Len > 0 && "Expected a non-negative length."); + AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len); + } + // Skip everything after the symbol. + if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) { + SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size()); + assert(Len > 0 && "Expected a non-negative length."); + AsmRewrites.emplace_back(AOK_Skip, Loc, Len); + } +} + +bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + + AsmToken::TokenKind PrevTK = AsmToken::Error; + bool Done = false; + while (!Done) { + bool UpdateLocLex = true; + + // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an + // identifier. Don't try an parse it as a register. + if (Tok.getString().startswith(".")) + break; + + // If we're parsing an immediate expression, we don't expect a '['. + if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) + break; + + AsmToken::TokenKind TK = getLexer().getKind(); + switch (TK) { + default: { + if (SM.isValidEndState()) { + Done = true; + break; + } + return Error(Tok.getLoc(), "unknown token in expression"); + } + case AsmToken::EndOfStatement: { + Done = true; + break; + } + case AsmToken::String: + case AsmToken::Identifier: { + // This could be a register or a symbolic displacement. 
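+      // For example, in "[rax + myVar]" the identifier "rax" resolves through
+      // ParseRegister() and is fed to the state machine as a register, while
+      // "myVar" falls through to the symbolic-displacement handling below.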
+ unsigned TmpReg; + const MCExpr *Val; + SMLoc IdentLoc = Tok.getLoc(); + StringRef Identifier = Tok.getString(); + if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) { + SM.onRegister(TmpReg); + UpdateLocLex = false; + break; + } else { + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return Error(Tok.getLoc(), "Unexpected identifier!"); + } else { + // This is a dot operator, not an adjacent identifier. + if (Identifier.find('.') != StringRef::npos && + PrevTK == AsmToken::RBrac) { + return false; + } else { + InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return true; + } + } + SM.onIdentifierExpr(Val, Identifier); + UpdateLocLex = false; + break; + } + return Error(Tok.getLoc(), "Unexpected identifier!"); + } + case AsmToken::Integer: { + StringRef ErrMsg; + if (isParsingInlineAsm() && SM.getAddImmPrefix()) + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc()); + // Look for 'b' or 'f' following an Integer as a directional label + SMLoc Loc = getTok().getLoc(); + int64_t IntVal = getTok().getIntVal(); + End = consumeToken(); + UpdateLocLex = false; + if (getLexer().getKind() == AsmToken::Identifier) { + StringRef IDVal = getTok().getString(); + if (IDVal == "f" || IDVal == "b") { + MCSymbol *Sym = + getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b"); + MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; + const MCExpr *Val = + MCSymbolRefExpr::create(Sym, Variant, getContext()); + if (IDVal == "b" && Sym->isUndefined()) + return Error(Loc, "invalid reference to undefined symbol"); + StringRef Identifier = Sym->getName(); + SM.onIdentifierExpr(Val, Identifier); + End = consumeToken(); + } else { + if (SM.onInteger(IntVal, ErrMsg)) + return Error(Loc, ErrMsg); + } + } else { + if (SM.onInteger(IntVal, ErrMsg)) + return Error(Loc, ErrMsg); + } + break; + } + case AsmToken::Plus: SM.onPlus(); break; + case AsmToken::Minus: SM.onMinus(); break; + case AsmToken::Tilde: SM.onNot(); break; + case AsmToken::Star: SM.onStar(); break; + case AsmToken::Slash: SM.onDivide(); break; + case AsmToken::Pipe: SM.onOr(); break; + case AsmToken::Caret: SM.onXor(); break; + case AsmToken::Amp: SM.onAnd(); break; + case AsmToken::LessLess: + SM.onLShift(); break; + case AsmToken::GreaterGreater: + SM.onRShift(); break; + case AsmToken::LBrac: SM.onLBrac(); break; + case AsmToken::RBrac: SM.onRBrac(); break; + case AsmToken::LParen: SM.onLParen(); break; + case AsmToken::RParen: SM.onRParen(); break; + } + if (SM.hadError()) + return Error(Tok.getLoc(), "unknown token in expression"); + + if (!Done && UpdateLocLex) + End = consumeToken(); + + PrevTK = TK; + } + return false; +} + +std::unique_ptr<X86Operand> +X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, + int64_t ImmDisp, unsigned Size) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); + if (getLexer().isNot(AsmToken::LBrac)) + return ErrorOperand(BracLoc, "Expected '[' token!"); + Parser.Lex(); // Eat '[' + + SMLoc StartInBrac = Tok.getLoc(); + // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We + // may have already parsed an immediate displacement before the bracketed + // expression. 
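+  // For example, for "mov eax, 8[ebx]" the caller has already consumed the 8
+  // and passes it in as ImmDisp; the state machine folds it together with any
+  // displacement that appears inside the brackets.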
+ IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true); + if (ParseIntelExpression(SM, End)) + return nullptr; + + const MCExpr *Disp = nullptr; + if (const MCExpr *Sym = SM.getSym()) { + // A symbolic displacement. + Disp = Sym; + if (isParsingInlineAsm()) + RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(), + ImmDisp, SM.getImm(), BracLoc, StartInBrac, + End); + } + + if (SM.getImm() || !Disp) { + const MCExpr *Imm = MCConstantExpr::create(SM.getImm(), getContext()); + if (Disp) + Disp = MCBinaryExpr::createAdd(Disp, Imm, getContext()); + else + Disp = Imm; // An immediate displacement only. + } + + // Parse struct field access. Intel requires a dot, but MSVC doesn't. MSVC + // will in fact do global lookup the field name inside all global typedefs, + // but we don't emulate that. + if (Tok.getString().find('.') != StringRef::npos) { + const MCExpr *NewDisp; + if (ParseIntelDotOperator(Disp, NewDisp)) + return nullptr; + + End = Tok.getEndLoc(); + Parser.Lex(); // Eat the field. + Disp = NewDisp; + } + + int BaseReg = SM.getBaseReg(); + int IndexReg = SM.getIndexReg(); + int Scale = SM.getScale(); + if (!isParsingInlineAsm()) { + // handle [-42] + if (!BaseReg && !IndexReg) { + if (!SegReg) + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + Start, End, Size); + } + StringRef ErrMsg; + if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { + Error(StartInBrac, ErrMsg); + return nullptr; + } + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, Start, End, Size); + } + + InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start, + End, Size, SM.getSymName(), Info); +} + +// Inline assembly may use variable names with namespace alias qualifiers. +bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, + StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End) { + MCAsmParser &Parser = getParser(); + assert(isParsingInlineAsm() && "Expected to be parsing inline assembly."); + Val = nullptr; + + StringRef LineBuf(Identifier.data()); + void *Result = + SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); + + const AsmToken &Tok = Parser.getTok(); + SMLoc Loc = Tok.getLoc(); + + // Advance the token stream until the end of the current token is + // after the end of what the frontend claimed. + const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size(); + do { + End = Tok.getEndLoc(); + getLexer().Lex(); + } while (End.getPointer() < EndPtr); + Identifier = LineBuf; + + // The frontend should end parsing on an assembler token boundary, unless it + // failed parsing. + assert((End.getPointer() == EndPtr || !Result) && + "frontend claimed part of a token?"); + + // If the identifier lookup was unsuccessful, assume that we are dealing with + // a label. + if (!Result) { + StringRef InternalName = + SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(), + Loc, false); + assert(InternalName.size() && "We should have an internal name here."); + // Push a rewrite for replacing the identifier name with the internal name. + InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(), + InternalName); + } + + // Create the symbol reference. 
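+  // The symbol is created from the identifier as written; when the name had to
+  // be treated as a label, the AOK_Label rewrite recorded above renames it in
+  // the rewritten assembly string.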
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); + MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; + Val = MCSymbolRefExpr::create(Sym, Variant, getParser().getContext()); + return false; +} + +/// \brief Parse intel style segment override. +std::unique_ptr<X86Operand> +X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, + unsigned Size) { + MCAsmParser &Parser = getParser(); + assert(SegReg != 0 && "Tried to parse a segment override without a segment!"); + const AsmToken &Tok = Parser.getTok(); // Eat colon. + if (Tok.isNot(AsmToken::Colon)) + return ErrorOperand(Tok.getLoc(), "Expected ':' token!"); + Parser.Lex(); // Eat ':' + + int64_t ImmDisp = 0; + if (getLexer().is(AsmToken::Integer)) { + ImmDisp = Tok.getIntVal(); + AsmToken ImmDispToken = Parser.Lex(); // Eat the integer. + + if (isParsingInlineAsm()) + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc()); + + if (getLexer().isNot(AsmToken::LBrac)) { + // An immediate following a 'segment register', 'colon' token sequence can + // be followed by a bracketed expression. If it isn't we know we have our + // final segment override. + const MCExpr *Disp = MCConstantExpr::create(ImmDisp, getContext()); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, + /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1, + Start, ImmDispToken.getEndLoc(), Size); + } + } + + if (getLexer().is(AsmToken::LBrac)) + return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); + + const MCExpr *Val; + SMLoc End; + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); + + return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); + } + + InlineAsmIdentifierInfo Info; + StringRef Identifier = Tok.getString(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return nullptr; + return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0, + /*Scale=*/1, Start, End, Size, Identifier, Info); +} + +//ParseRoundingModeOp - Parse AVX-512 rounding mode operand +std::unique_ptr<X86Operand> +X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + // Eat "{" and mark the current place. 
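+  // Accepted forms are the AVX-512 embedded-rounding and SAE operands:
+  // {rn-sae}, {rd-sae}, {ru-sae}, {rz-sae}, or a bare {sae}. Anything else is
+  // rejected below.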
+ const SMLoc consumedToken = consumeToken(); + if (Tok.getIdentifier().startswith("r")){ + int rndMode = StringSwitch<int>(Tok.getIdentifier()) + .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT) + .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF) + .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF) + .Case("rz", X86::STATIC_ROUNDING::TO_ZERO) + .Default(-1); + if (-1 == rndMode) + return ErrorOperand(Tok.getLoc(), "Invalid rounding mode."); + Parser.Lex(); // Eat "r*" of r*-sae + if (!getLexer().is(AsmToken::Minus)) + return ErrorOperand(Tok.getLoc(), "Expected - at this point"); + Parser.Lex(); // Eat "-" + Parser.Lex(); // Eat the sae + if (!getLexer().is(AsmToken::RCurly)) + return ErrorOperand(Tok.getLoc(), "Expected } at this point"); + Parser.Lex(); // Eat "}" + const MCExpr *RndModeOp = + MCConstantExpr::create(rndMode, Parser.getContext()); + return X86Operand::CreateImm(RndModeOp, Start, End); + } + if(Tok.getIdentifier().equals("sae")){ + Parser.Lex(); // Eat the sae + if (!getLexer().is(AsmToken::RCurly)) + return ErrorOperand(Tok.getLoc(), "Expected } at this point"); + Parser.Lex(); // Eat "}" + return X86Operand::CreateToken("{sae}", consumedToken); + } + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); +} +/// ParseIntelMemOperand - Parse intel style memory operand. +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, + SMLoc Start, + unsigned Size) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc End; + + // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. + if (getLexer().is(AsmToken::LBrac)) + return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size); + assert(ImmDisp == 0); + + const MCExpr *Val; + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); + + return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); + } + + InlineAsmIdentifierInfo Info; + StringRef Identifier = Tok.getString(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return nullptr; + + if (!getLexer().is(AsmToken::LBrac)) + return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0, + /*Scale=*/1, Start, End, Size, Identifier, Info); + + Parser.Lex(); // Eat '[' + + // Parse Identifier [ ImmDisp ] + IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true, + /*AddImmPrefix=*/false); + if (ParseIntelExpression(SM, End)) + return nullptr; + + if (SM.getSym()) { + Error(Start, "cannot use more than one symbol in memory operand"); + return nullptr; + } + if (SM.getBaseReg()) { + Error(Start, "cannot use base register with variable reference"); + return nullptr; + } + if (SM.getIndexReg()) { + Error(Start, "cannot use index register with variable reference"); + return nullptr; + } + + const MCExpr *Disp = MCConstantExpr::create(SM.getImm(), getContext()); + // BaseReg is non-zero to avoid assertions. In the context of inline asm, + // we're pointing to a local variable in memory, so the base register is + // really the frame or stack pointer. + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/1, /*IndexReg=*/0, /*Scale=*/1, + Start, End, Size, Identifier, Info.OpDecl); +} + +/// Parse the '.' operator. 
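+/// Handles Intel struct field references such as "[ebx].bar" or "foo.bar": the
+/// field offset (a constant, or a field resolved through the frontend's
+/// LookupInlineAsmField() when parsing MS inline assembly) is added to the
+/// existing displacement.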
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, + const MCExpr *&NewDisp) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + int64_t OrigDispVal, DotDispVal; + + // FIXME: Handle non-constant expressions. + if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) + OrigDispVal = OrigDisp->getValue(); + else + return Error(Tok.getLoc(), "Non-constant offsets are not supported!"); + + // Drop the optional '.'. + StringRef DotDispStr = Tok.getString(); + if (DotDispStr.startswith(".")) + DotDispStr = DotDispStr.drop_front(1); + + // .Imm gets lexed as a real. + if (Tok.is(AsmToken::Real)) { + APInt DotDisp; + DotDispStr.getAsInteger(10, DotDisp); + DotDispVal = DotDisp.getZExtValue(); + } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { + unsigned DotDisp; + std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); + if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second, + DotDisp)) + return Error(Tok.getLoc(), "Unable to lookup field reference!"); + DotDispVal = DotDisp; + } else + return Error(Tok.getLoc(), "Unexpected token type!"); + + if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { + SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); + unsigned Len = DotDispStr.size(); + unsigned Val = OrigDispVal + DotDispVal; + InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val); + } + + NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); + return false; +} + +/// Parse the 'offset' operator. This operator is used to specify the +/// location rather then the content of a variable. +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc OffsetOfLoc = Tok.getLoc(); + Parser.Lex(); // Eat offset. + + const MCExpr *Val; + InlineAsmIdentifierInfo Info; + SMLoc Start = Tok.getLoc(), End; + StringRef Identifier = Tok.getString(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return nullptr; + + // Don't emit the offset operator. + InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7); + + // The offset operator will have an 'r' constraint, thus we need to create + // register operand to ensure proper matching. Just pick a GPR based on + // the size of a pointer. + unsigned RegNo = + is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX); + return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true, + OffsetOfLoc, Identifier, Info.OpDecl); +} + +enum IntelOperatorKind { + IOK_LENGTH, + IOK_SIZE, + IOK_TYPE +}; + +/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator +/// returns the number of elements in an array. It returns the value 1 for +/// non-array variables. The SIZE operator returns the size of a C or C++ +/// variable. A variable's size is the product of its LENGTH and TYPE. The +/// TYPE operator returns the size of a C or C++ type or variable. If the +/// variable is an array, TYPE returns the size of a single element. +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc TypeLoc = Tok.getLoc(); + Parser.Lex(); // Eat operator. 
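+  // The identifier is looked up in unevaluated mode: only the frontend's
+  // metadata for it (length, size, type) is needed, because the whole operator
+  // expression is rewritten to a plain immediate below.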
+ + const MCExpr *Val = nullptr; + InlineAsmIdentifierInfo Info; + SMLoc Start = Tok.getLoc(), End; + StringRef Identifier = Tok.getString(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/true, End)) + return nullptr; + + if (!Info.OpDecl) + return ErrorOperand(Start, "unable to lookup expression"); + + unsigned CVal = 0; + switch(OpKind) { + default: llvm_unreachable("Unexpected operand kind!"); + case IOK_LENGTH: CVal = Info.Length; break; + case IOK_SIZE: CVal = Info.Size; break; + case IOK_TYPE: CVal = Info.Type; break; + } + + // Rewrite the type operator and the C or C++ type or variable in terms of an + // immediate. E.g. TYPE foo -> $$4 + unsigned Len = End.getPointer() - TypeLoc.getPointer(); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal); + + const MCExpr *Imm = MCConstantExpr::create(CVal, getContext()); + return X86Operand::CreateImm(Imm, Start, End); +} + +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc Start, End; + + // Offset, length, type and size operators. + if (isParsingInlineAsm()) { + StringRef AsmTokStr = Tok.getString(); + if (AsmTokStr == "offset" || AsmTokStr == "OFFSET") + return ParseIntelOffsetOfOperator(); + if (AsmTokStr == "length" || AsmTokStr == "LENGTH") + return ParseIntelOperator(IOK_LENGTH); + if (AsmTokStr == "size" || AsmTokStr == "SIZE") + return ParseIntelOperator(IOK_SIZE); + if (AsmTokStr == "type" || AsmTokStr == "TYPE") + return ParseIntelOperator(IOK_TYPE); + } + + bool PtrInOperand = false; + unsigned Size = getIntelMemOperandSize(Tok.getString()); + if (Size) { + Parser.Lex(); // Eat operand size (e.g., byte, word). + if (Tok.getString() != "PTR" && Tok.getString() != "ptr") + return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); + Parser.Lex(); // Eat ptr. + PtrInOperand = true; + } + Start = Tok.getLoc(); + + // Immediate. + if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) || + getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) { + AsmToken StartTok = Tok; + IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, + /*AddImmPrefix=*/false); + if (ParseIntelExpression(SM, End)) + return nullptr; + + int64_t Imm = SM.getImm(); + if (isParsingInlineAsm()) { + unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); + if (StartTok.getString().size() == Len) + // Just add a prefix if this wasn't a complex immediate expression. + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start); + else + // Otherwise, rewrite the complex expression as a single immediate. + InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm); + } + + if (getLexer().isNot(AsmToken::LBrac)) { + // If a directional label (ie. 1f or 2b) was parsed above from + // ParseIntelExpression() then SM.getSym() was set to a pointer to + // to the MCExpr with the directional local symbol and this is a + // memory operand not an immediate operand. + if (SM.getSym()) + return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End, + Size); + + const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext()); + return X86Operand::CreateImm(ImmExpr, Start, End); + } + + // Only positive immediates are valid. + if (Imm < 0) + return ErrorOperand(Start, "expected a positive immediate displacement " + "before bracketed expr."); + + // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. 
+ return ParseIntelMemOperand(Imm, Start, Size); + } + + // rounding mode token + if (getSTI().getFeatureBits()[X86::FeatureAVX512] && + getLexer().is(AsmToken::LCurly)) + return ParseRoundingModeOp(Start, End); + + // Register. + unsigned RegNo = 0; + if (!ParseRegister(RegNo, Start, End)) { + // If this is a segment register followed by a ':', then this is the start + // of a segment override, otherwise this is a normal register reference. + // In case it is a normal register and there is ptr in the operand this + // is an error + if (getLexer().isNot(AsmToken::Colon)){ + if (PtrInOperand){ + return ErrorOperand(Start, "expected memory operand after " + "'ptr', found register operand instead"); + } + return X86Operand::CreateReg(RegNo, Start, End); + } + + return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); + } + + // Memory operand. + return ParseIntelMemOperand(/*Disp=*/0, Start, Size); +} + +std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { + MCAsmParser &Parser = getParser(); + switch (getLexer().getKind()) { + default: + // Parse a memory operand with no segment register. + return ParseMemOperand(0, Parser.getTok().getLoc()); + case AsmToken::Percent: { + // Read the register. + unsigned RegNo; + SMLoc Start, End; + if (ParseRegister(RegNo, Start, End)) return nullptr; + if (RegNo == X86::EIZ || RegNo == X86::RIZ) { + Error(Start, "%eiz and %riz can only be used as index registers", + SMRange(Start, End)); + return nullptr; + } + + // If this is a segment register followed by a ':', then this is the start + // of a memory reference, otherwise this is a normal register reference. + if (getLexer().isNot(AsmToken::Colon)) + return X86Operand::CreateReg(RegNo, Start, End); + + if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo)) + return ErrorOperand(Start, "invalid segment register"); + + getParser().Lex(); // Eat the colon. + return ParseMemOperand(RegNo, Start); + } + case AsmToken::Dollar: { + // $42 -> immediate. + SMLoc Start = Parser.getTok().getLoc(), End; + Parser.Lex(); + const MCExpr *Val; + if (getParser().parseExpression(Val, End)) + return nullptr; + return X86Operand::CreateImm(Val, Start, End); + } + case AsmToken::LCurly:{ + SMLoc Start = Parser.getTok().getLoc(), End; + if (getSTI().getFeatureBits()[X86::FeatureAVX512]) + return ParseRoundingModeOp(Start, End); + return ErrorOperand(Start, "unknown token in expression"); + } + } +} + +bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op) { + MCAsmParser &Parser = getParser(); + if(getSTI().getFeatureBits()[X86::FeatureAVX512]) { + if (getLexer().is(AsmToken::LCurly)) { + // Eat "{" and mark the current place. + const SMLoc consumedToken = consumeToken(); + // Distinguish {1to<NUM>} from {%k<NUM>}. + if(getLexer().is(AsmToken::Integer)) { + // Parse memory broadcasting ({1to<NUM>}). + if (getLexer().getTok().getIntVal() != 1) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected 1to<NUM> at this point"); + Parser.Lex(); // Eat "1" of 1to8 + if (!getLexer().is(AsmToken::Identifier) || + !getLexer().getTok().getIdentifier().startswith("to")) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected 1to<NUM> at this point"); + // Recognize only reasonable suffixes. 
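+        // e.g. "vaddps zmm0, zmm1, dword ptr [rax]{1to16}"; only the element
+        // counts 2, 4, 8 and 16 are accepted as broadcast multipliers.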
+ const char *BroadcastPrimitive = + StringSwitch<const char*>(getLexer().getTok().getIdentifier()) + .Case("to2", "{1to2}") + .Case("to4", "{1to4}") + .Case("to8", "{1to8}") + .Case("to16", "{1to16}") + .Default(nullptr); + if (!BroadcastPrimitive) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Invalid memory broadcast primitive."); + Parser.Lex(); // Eat "toN" of 1toN + if (!getLexer().is(AsmToken::RCurly)) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected } at this point"); + Parser.Lex(); // Eat "}" + Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive, + consumedToken)); + // No AVX512 specific primitives can pass + // after memory broadcasting, so return. + return true; + } else { + // Parse mask register {%k1} + Operands.push_back(X86Operand::CreateToken("{", consumedToken)); + if (std::unique_ptr<X86Operand> Op = ParseOperand()) { + Operands.push_back(std::move(Op)); + if (!getLexer().is(AsmToken::RCurly)) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected } at this point"); + Operands.push_back(X86Operand::CreateToken("}", consumeToken())); + + // Parse "zeroing non-masked" semantic {z} + if (getLexer().is(AsmToken::LCurly)) { + Operands.push_back(X86Operand::CreateToken("{z}", consumeToken())); + if (!getLexer().is(AsmToken::Identifier) || + getLexer().getTok().getIdentifier() != "z") + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected z at this point"); + Parser.Lex(); // Eat the z + if (!getLexer().is(AsmToken::RCurly)) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected } at this point"); + Parser.Lex(); // Eat the } + } + } + } + } + } + return true; +} + +/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix +/// has already been parsed if present. +std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, + SMLoc MemStart) { + + MCAsmParser &Parser = getParser(); + // We have to disambiguate a parenthesized expression "(4+5)" from the start + // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The + // only way to do this without lookahead is to eat the '(' and see what is + // after it. + const MCExpr *Disp = MCConstantExpr::create(0, getParser().getContext()); + if (getLexer().isNot(AsmToken::LParen)) { + SMLoc ExprEnd; + if (getParser().parseExpression(Disp, ExprEnd)) return nullptr; + + // After parsing the base expression we could either have a parenthesized + // memory address or not. If not, return now. If so, eat the (. + if (getLexer().isNot(AsmToken::LParen)) { + // Unless we have a segment register, treat this as an immediate. + if (SegReg == 0) + return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + MemStart, ExprEnd); + } + + // Eat the '('. + Parser.Lex(); + } else { + // Okay, we have a '('. We don't know if this is an expression or not, but + // so we have to eat the ( to see beyond it. + SMLoc LParenLoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat the '('. + + if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) { + // Nothing to do here, fall into the code below with the '(' part of the + // memory operand consumed. + } else { + SMLoc ExprEnd; + + // It must be an parenthesized expression, parse it now. + if (getParser().parseParenExpression(Disp, ExprEnd)) + return nullptr; + + // After parsing the base expression we could either have a parenthesized + // memory address or not. If not, return now. 
If so, eat the (. + if (getLexer().isNot(AsmToken::LParen)) { + // Unless we have a segment register, treat this as an immediate. + if (SegReg == 0) + return X86Operand::CreateMem(getPointerWidth(), Disp, LParenLoc, + ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + MemStart, ExprEnd); + } + + // Eat the '('. + Parser.Lex(); + } + } + + // If we reached here, then we just ate the ( of the memory operand. Process + // the rest of the memory operand. + unsigned BaseReg = 0, IndexReg = 0, Scale = 1; + SMLoc IndexLoc, BaseLoc; + + if (getLexer().is(AsmToken::Percent)) { + SMLoc StartLoc, EndLoc; + BaseLoc = Parser.getTok().getLoc(); + if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr; + if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) { + Error(StartLoc, "eiz and riz can only be used as index registers", + SMRange(StartLoc, EndLoc)); + return nullptr; + } + } + + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + IndexLoc = Parser.getTok().getLoc(); + + // Following the comma we should have either an index register, or a scale + // value. We don't support the later form, but we want to parse it + // correctly. + // + // Not that even though it would be completely consistent to support syntax + // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this. + if (getLexer().is(AsmToken::Percent)) { + SMLoc L; + if (ParseRegister(IndexReg, L, L)) return nullptr; + + if (getLexer().isNot(AsmToken::RParen)) { + // Parse the scale amount: + // ::= ',' [scale-expression] + if (getLexer().isNot(AsmToken::Comma)) { + Error(Parser.getTok().getLoc(), + "expected comma in scale expression"); + return nullptr; + } + Parser.Lex(); // Eat the comma. + + if (getLexer().isNot(AsmToken::RParen)) { + SMLoc Loc = Parser.getTok().getLoc(); + + int64_t ScaleVal; + if (getParser().parseAbsoluteExpression(ScaleVal)){ + Error(Loc, "expected scale expression"); + return nullptr; + } + + // Validate the scale amount. + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && + ScaleVal != 1) { + Error(Loc, "scale factor in 16-bit address must be 1"); + return nullptr; + } + if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && + ScaleVal != 8) { + Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); + return nullptr; + } + Scale = (unsigned)ScaleVal; + } + } + } else if (getLexer().isNot(AsmToken::RParen)) { + // A scale amount without an index is ignored. + // index. + SMLoc Loc = Parser.getTok().getLoc(); + + int64_t Value; + if (getParser().parseAbsoluteExpression(Value)) + return nullptr; + + if (Value != 1) + Warning(Loc, "scale factor without index register is ignored"); + Scale = 1; + } + } + + // Ok, we've eaten the memory operand, verify we have a ')' and eat it too. + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "unexpected token in memory operand"); + return nullptr; + } + SMLoc MemEnd = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat the ')'. + + // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed, + // and then only in non-64-bit modes. Except for DX, which is a special case + // because an unofficial form of in/out instructions uses it. 
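+  // For example, "(%bx,%si)" and "(%bp,%di)" are accepted outside 64-bit mode,
+  // while "(%cx)", or any 16-bit base register in 64-bit mode, is rejected
+  // here.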
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && + (is64BitMode() || (BaseReg != X86::BX && BaseReg != X86::BP && + BaseReg != X86::SI && BaseReg != X86::DI)) && + BaseReg != X86::DX) { + Error(BaseLoc, "invalid 16-bit base register"); + return nullptr; + } + if (BaseReg == 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) { + Error(IndexLoc, "16-bit memory operand may not include only index register"); + return nullptr; + } + + StringRef ErrMsg; + if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { + Error(BaseLoc, ErrMsg); + return nullptr; + } + + if (SegReg || BaseReg || IndexReg) + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, MemStart, MemEnd); + return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd); +} + +bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + InstInfo = &Info; + StringRef PatchedName = Name; + + // FIXME: Hack to recognize setneb as setne. + if (PatchedName.startswith("set") && PatchedName.endswith("b") && + PatchedName != "setb" && PatchedName != "setnb") + PatchedName = PatchedName.substr(0, Name.size()-1); + + // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. + if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && + (PatchedName.endswith("ss") || PatchedName.endswith("sd") || + PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { + bool IsVCMP = PatchedName[0] == 'v'; + unsigned CCIdx = IsVCMP ? 4 : 3; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(CCIdx, PatchedName.size() - 2)) + .Case("eq", 0x00) + .Case("lt", 0x01) + .Case("le", 0x02) + .Case("unord", 0x03) + .Case("neq", 0x04) + .Case("nlt", 0x05) + .Case("nle", 0x06) + .Case("ord", 0x07) + /* AVX only from here */ + .Case("eq_uq", 0x08) + .Case("nge", 0x09) + .Case("ngt", 0x0A) + .Case("false", 0x0B) + .Case("neq_oq", 0x0C) + .Case("ge", 0x0D) + .Case("gt", 0x0E) + .Case("true", 0x0F) + .Case("eq_os", 0x10) + .Case("lt_oq", 0x11) + .Case("le_oq", 0x12) + .Case("unord_s", 0x13) + .Case("neq_us", 0x14) + .Case("nlt_uq", 0x15) + .Case("nle_uq", 0x16) + .Case("ord_s", 0x17) + .Case("eq_us", 0x18) + .Case("nge_uq", 0x19) + .Case("ngt_uq", 0x1A) + .Case("false_os", 0x1B) + .Case("neq_os", 0x1C) + .Case("ge_oq", 0x1D) + .Case("gt_oq", 0x1E) + .Case("true_us", 0x1F) + .Default(~0U); + if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) { + + Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx), + NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - 2); + } + } + + // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}. + if (PatchedName.startswith("vpcmp") && + (PatchedName.endswith("b") || PatchedName.endswith("w") || + PatchedName.endswith("d") || PatchedName.endswith("q"))) { + unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - CCIdx)) + .Case("eq", 0x0) // Only allowed on unsigned. Checked below. + .Case("lt", 0x1) + .Case("le", 0x2) + //.Case("false", 0x3) // Not a documented alias. + .Case("neq", 0x4) + .Case("nlt", 0x5) + .Case("nle", 0x6) + //.Case("true", 0x7) // Not a documented alias. 
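+        // A matched spelling is handled as plain "vpcmp" plus the comparison
+        // code as an extra immediate operand, e.g. "vpcmpltud" is treated as
+        // "vpcmpud" with immediate 1.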
+ .Default(~0U); + if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) { + Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + } + } + + // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}. + if (PatchedName.startswith("vpcom") && + (PatchedName.endswith("b") || PatchedName.endswith("w") || + PatchedName.endswith("d") || PatchedName.endswith("q"))) { + unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - CCIdx)) + .Case("lt", 0x0) + .Case("le", 0x1) + .Case("gt", 0x2) + .Case("ge", 0x3) + .Case("eq", 0x4) + .Case("neq", 0x5) + .Case("false", 0x6) + .Case("true", 0x7) + .Default(~0U); + if (ComparisonCode != ~0U) { + Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + } + } + + Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); + + // Determine whether this is an instruction prefix. + bool isPrefix = + Name == "lock" || Name == "rep" || + Name == "repe" || Name == "repz" || + Name == "repne" || Name == "repnz" || + Name == "rex64" || Name == "data16"; + + // This does the actual operand parsing. Don't parse any more if we have a + // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we + // just want to parse the "lock" as the first instruction and the "incl" as + // the next one. + if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) { + + // Parse '*' modifier. + if (getLexer().is(AsmToken::Star)) + Operands.push_back(X86Operand::CreateToken("*", consumeToken())); + + // Read the operands. + while(1) { + if (std::unique_ptr<X86Operand> Op = ParseOperand()) { + Operands.push_back(std::move(Op)); + if (!HandleAVX512Operand(Operands, *Operands.back())) + return true; + } else { + Parser.eatToEndOfStatement(); + return true; + } + // check for comma and eat it + if (getLexer().is(AsmToken::Comma)) + Parser.Lex(); + else + break; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return ErrorAndEatStatement(getLexer().getLoc(), + "unexpected token in argument list"); + } + + // Consume the EndOfStatement or the prefix separator Slash + if (getLexer().is(AsmToken::EndOfStatement) || + (isPrefix && getLexer().is(AsmToken::Slash))) + Parser.Lex(); + + // This is for gas compatibility and cannot be done in td. + // Adding "p" for some floating point with no argument. + // For example: fsub --> fsubp + bool IsFp = + Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr"; + if (IsFp && Operands.size() == 1) { + const char *Repl = StringSwitch<const char *>(Name) + .Case("fsub", "fsubp") + .Case("fdiv", "fdivp") + .Case("fsubr", "fsubrp") + .Case("fdivr", "fdivrp"); + static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl); + } + + // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> + // "outb %al, %dx". Out doesn't take a memory form, but this is a widely + // documented form in various unofficial manuals, so a lot of code uses it. 
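+  // The rewrite below only fires when the operand really is a bare (%dx): no
+  // segment, no index register and a zero displacement. The memory operand is
+  // then replaced by a plain DX register operand; "in" gets the same treatment.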
+ if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") && + Operands.size() == 3) { + X86Operand &Op = (X86Operand &)*Operands.back(); + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + } + } + // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al". + if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") && + Operands.size() == 3) { + X86Operand &Op = (X86Operand &)*Operands[1]; + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + } + } + + // Append default arguments to "ins[bwld]" + if (Name.startswith("ins") && Operands.size() == 1 && + (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd")) { + AddDefaultSrcDestOperands(Operands, + X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), + DefaultMemDIOperand(NameLoc)); + } + + // Append default arguments to "outs[bwld]" + if (Name.startswith("outs") && Operands.size() == 1 && + (Name == "outsb" || Name == "outsw" || Name == "outsl" || + Name == "outsd" )) { + AddDefaultSrcDestOperands(Operands, + DefaultMemSIOperand(NameLoc), + X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); + } + + // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate + // values of $SIREG according to the mode. It would be nice if this + // could be achieved with InstAlias in the tables. + if (Name.startswith("lods") && Operands.size() == 1 && + (Name == "lods" || Name == "lodsb" || Name == "lodsw" || + Name == "lodsl" || Name == "lodsd" || Name == "lodsq")) + Operands.push_back(DefaultMemSIOperand(NameLoc)); + + // Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate + // values of $DIREG according to the mode. It would be nice if this + // could be achieved with InstAlias in the tables. + if (Name.startswith("stos") && Operands.size() == 1 && + (Name == "stos" || Name == "stosb" || Name == "stosw" || + Name == "stosl" || Name == "stosd" || Name == "stosq")) + Operands.push_back(DefaultMemDIOperand(NameLoc)); + + // Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate + // values of $DIREG according to the mode. It would be nice if this + // could be achieved with InstAlias in the tables. + if (Name.startswith("scas") && Operands.size() == 1 && + (Name == "scas" || Name == "scasb" || Name == "scasw" || + Name == "scasl" || Name == "scasd" || Name == "scasq")) + Operands.push_back(DefaultMemDIOperand(NameLoc)); + + // Add default SI and DI operands to "cmps[bwlq]". + if (Name.startswith("cmps") && + (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" || + Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) { + if (Operands.size() == 1) { + AddDefaultSrcDestOperands(Operands, + DefaultMemDIOperand(NameLoc), + DefaultMemSIOperand(NameLoc)); + } else if (Operands.size() == 3) { + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; + if (!doSrcDstMatch(Op, Op2)) + return Error(Op.getStartLoc(), + "mismatching source and destination index registers"); + } + } + + // Add default SI and DI operands to "movs[bwlq]". 
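+  // For example, a bare "movsb" gets (%esi)/(%edi) appended in 32-bit mode (or
+  // the 16/64-bit equivalents), and a bare "movsd" is treated as the string
+  // move, rewritten to "movsl", rather than the SSE2 instruction of that name.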
+ if ((Name.startswith("movs") && + (Name == "movs" || Name == "movsb" || Name == "movsw" || + Name == "movsl" || Name == "movsd" || Name == "movsq")) || + (Name.startswith("smov") && + (Name == "smov" || Name == "smovb" || Name == "smovw" || + Name == "smovl" || Name == "smovd" || Name == "smovq"))) { + if (Operands.size() == 1) { + if (Name == "movsd") + Operands.back() = X86Operand::CreateToken("movsl", NameLoc); + AddDefaultSrcDestOperands(Operands, + DefaultMemSIOperand(NameLoc), + DefaultMemDIOperand(NameLoc)); + } else if (Operands.size() == 3) { + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; + if (!doSrcDstMatch(Op, Op2)) + return Error(Op.getStartLoc(), + "mismatching source and destination index registers"); + } + } + + // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to + // "shift <op>". + if ((Name.startswith("shr") || Name.startswith("sar") || + Name.startswith("shl") || Name.startswith("sal") || + Name.startswith("rcl") || Name.startswith("rcr") || + Name.startswith("rol") || Name.startswith("ror")) && + Operands.size() == 3) { + if (isParsingIntelSyntax()) { + // Intel syntax + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 1) + Operands.pop_back(); + } else { + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 1) + Operands.erase(Operands.begin() + 1); + } + } + + // Transforms "int $3" into "int3" as a size optimization. We can't write an + // instalias with an immediate operand yet. + if (Name == "int" && Operands.size() == 2) { + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); + if (Op1.isImm()) + if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm())) + if (CE->getValue() == 3) { + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); + } + } + + return false; +} + +bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { + switch (Inst.getOpcode()) { + default: return false; + case X86::VMOVZPQILo2PQIrr: + case X86::VMOVAPDrr: + case X86::VMOVAPDYrr: + case X86::VMOVAPSrr: + case X86::VMOVAPSYrr: + case X86::VMOVDQArr: + case X86::VMOVDQAYrr: + case X86::VMOVDQUrr: + case X86::VMOVDQUYrr: + case X86::VMOVUPDrr: + case X86::VMOVUPDYrr: + case X86::VMOVUPSrr: + case X86::VMOVUPSYrr: { + if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) || + !X86II::isX86_64ExtendedReg(Inst.getOperand(1).getReg())) + return false; + + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; 
break; + } + Inst.setOpcode(NewOpc); + return true; + } + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) || + !X86II::isX86_64ExtendedReg(Inst.getOperand(2).getReg())) + return false; + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break; + case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break; + } + Inst.setOpcode(NewOpc); + return true; + } + } +} + +static const char *getSubtargetFeatureName(uint64_t Val); + +void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, + MCStreamer &Out) { + Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(), + MII, Out); +} + +bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + if (isParsingIntelSyntax()) + return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, + MatchingInlineAsm); + return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, + MatchingInlineAsm); +} + +void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, + OperandVector &Operands, MCStreamer &Out, + bool MatchingInlineAsm) { + // FIXME: This should be replaced with a real .td file alias mechanism. + // Also, MatchInstructionImpl should actually *do* the EmitInstruction + // call. + const char *Repl = StringSwitch<const char *>(Op.getToken()) + .Case("finit", "fninit") + .Case("fsave", "fnsave") + .Case("fstcw", "fnstcw") + .Case("fstcww", "fnstcw") + .Case("fstenv", "fnstenv") + .Case("fstsw", "fnstsw") + .Case("fstsww", "fnstsw") + .Case("fclex", "fnclex") + .Default(nullptr); + if (Repl) { + MCInst Inst; + Inst.setOpcode(X86::WAIT); + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Operands[0] = X86Operand::CreateToken(Repl, IDLoc); + } +} + +bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool MatchingInlineAsm) { + assert(ErrorInfo && "Unknown missing feature!"); + ArrayRef<SMRange> EmptyRanges = None; + SmallString<126> Msg; + raw_svector_ostream OS(Msg); + OS << "instruction requires:"; + uint64_t Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) + OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask); + Mask <<= 1; + } + return Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); +} + +bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + ArrayRef<SMRange> EmptyRanges = None; + + // First, handle aliases that expand to multiple instructions. + MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); + + bool WasOriginallyInvalidOperand = false; + MCInst Inst; + + // First, try a direct match. + switch (MatchInstructionImpl(Operands, Inst, + ErrorInfo, MatchingInlineAsm, + isParsingIntelSyntax())) { + default: llvm_unreachable("Unexpected match result!"); + case Match_Success: + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. Loop on it while changes happen so the + // individual transformations can chain off each other. 
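+ // When only matching (as for inline assembly), the caller just needs the
+ // resulting opcode, so the rewriting loop and the emission below are
+ // skipped.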
+ if (!MatchingInlineAsm) + while (processInstruction(Inst, Operands)) + ; + + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Opcode = Inst.getOpcode(); + return false; + case Match_MissingFeature: + return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm); + case Match_InvalidOperand: + WasOriginallyInvalidOperand = true; + break; + case Match_MnemonicFail: + break; + } + + // FIXME: Ideally, we would only attempt suffix matches for things which are + // valid prefixes, and we could just infer the right unambiguous + // type. However, that requires substantially more matcher support than the + // following hack. + + // Change the operand to point to a temporary token. + StringRef Base = Op.getToken(); + SmallString<16> Tmp; + Tmp += Base; + Tmp += ' '; + Op.setTokenValue(Tmp); + + // If this instruction starts with an 'f', then it is a floating point stack + // instruction. These come in up to three forms for 32-bit, 64-bit, and + // 80-bit floating point, which use the suffixes s,l,t respectively. + // + // Otherwise, we assume that this may be an integer instruction, which comes + // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively. + const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0"; + + // Check for the various suffix matches. + uint64_t ErrorInfoIgnore; + uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings. + unsigned Match[4]; + + for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) { + Tmp.back() = Suffixes[I]; + Match[I] = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match[I] == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + } + + // Restore the old token. + Op.setTokenValue(Base); + + // If exactly one matched, then we treat that as a successful match (and the + // instruction will already have been filled in correctly, since the failing + // matches won't have modified it). + unsigned NumSuccessfulMatches = + std::count(std::begin(Match), std::end(Match), Match_Success); + if (NumSuccessfulMatches == 1) { + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Opcode = Inst.getOpcode(); + return false; + } + + // Otherwise, the match failed, try to produce a decent error message. + + // If we had multiple suffix matches, then identify this as an ambiguous + // match. + if (NumSuccessfulMatches > 1) { + char MatchChars[4]; + unsigned NumMatches = 0; + for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) + if (Match[I] == Match_Success) + MatchChars[NumMatches++] = Suffixes[I]; + + SmallString<126> Msg; + raw_svector_ostream OS(Msg); + OS << "ambiguous instructions require an explicit suffix (could be "; + for (unsigned i = 0; i != NumMatches; ++i) { + if (i != 0) + OS << ", "; + if (i + 1 == NumMatches) + OS << "or "; + OS << "'" << Base << MatchChars[i] << "'"; + } + OS << ")"; + Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); + return true; + } + + // Okay, we know that none of the variants matched successfully. + + // If all of the instructions reported an invalid mnemonic, then the original + // mnemonic was invalid. + if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) { + if (!WasOriginallyInvalidOperand) { + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? 
EmptyRanges : Op.getLocRange(); + return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", + Ranges, MatchingInlineAsm); + } + + // Recover location info for the operand if we know which was the problem. + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction", + EmptyRanges, MatchingInlineAsm); + + X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo]; + if (Operand.getStartLoc().isValid()) { + SMRange OperandRange = Operand.getLocRange(); + return Error(Operand.getStartLoc(), "invalid operand for instruction", + OperandRange, MatchingInlineAsm); + } + } + + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + } + + // If one instruction matched with a missing feature, report this as a + // missing feature. + if (std::count(std::begin(Match), std::end(Match), + Match_MissingFeature) == 1) { + ErrorInfo = ErrorInfoMissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + MatchingInlineAsm); + } + + // If one instruction matched with an invalid operand, report this as an + // operand failure. + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidOperand) == 1) { + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + } + + // If all of these were an outright failure, report it in a useless way. + Error(IDLoc, "unknown use of instruction mnemonic without a size suffix", + EmptyRanges, MatchingInlineAsm); + return true; +} + +bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + StringRef Mnemonic = Op.getToken(); + ArrayRef<SMRange> EmptyRanges = None; + + // First, handle aliases that expand to multiple instructions. + MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); + + MCInst Inst; + + // Find one unsized memory operand, if present. + X86Operand *UnsizedMemOp = nullptr; + for (const auto &Op : Operands) { + X86Operand *X86Op = static_cast<X86Operand *>(Op.get()); + if (X86Op->isMemUnsized()) + UnsizedMemOp = X86Op; + } + + // Allow some instructions to have implicitly pointer-sized operands. This is + // compatible with gas. + if (UnsizedMemOp) { + static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"}; + for (const char *Instr : PtrSizedInstrs) { + if (Mnemonic == Instr) { + UnsizedMemOp->Mem.Size = getPointerWidth(); + break; + } + } + } + + // If an unsized memory operand is present, try to match with each memory + // operand size. In Intel assembly, the size is not part of the instruction + // mnemonic. + SmallVector<unsigned, 8> Match; + uint64_t ErrorInfoMissingFeature = 0; + if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) { + static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512}; + for (unsigned Size : MopSizes) { + UnsizedMemOp->Mem.Size = Size; + uint64_t ErrorInfoIgnore; + unsigned LastOpcode = Inst.getOpcode(); + unsigned M = + MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); + if (Match.empty() || LastOpcode != Inst.getOpcode()) + Match.push_back(M); + + // If this returned as a missing feature failure, remember that. 
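+ // (Only the most recent ErrorInfo is kept; a single value is enough to
+ // build the missing-feature diagnostic later on.)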
+ if (Match.back() == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + } + + // Restore the size of the unsized memory operand if we modified it. + if (UnsizedMemOp) + UnsizedMemOp->Mem.Size = 0; + } + + // If we haven't matched anything yet, this is not a basic integer or FPU + // operation. There shouldn't be any ambiguity in our mnemonic table, so try + // matching with the unsized operand. + if (Match.empty()) { + Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo, + MatchingInlineAsm, + isParsingIntelSyntax())); + // If this returned as a missing feature failure, remember that. + if (Match.back() == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfo; + } + + // Restore the size of the unsized memory operand if we modified it. + if (UnsizedMemOp) + UnsizedMemOp->Mem.Size = 0; + + // If it's a bad mnemonic, all results will be the same. + if (Match.back() == Match_MnemonicFail) { + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? EmptyRanges : Op.getLocRange(); + return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'", + Ranges, MatchingInlineAsm); + } + + // If exactly one matched, then we treat that as a successful match (and the + // instruction will already have been filled in correctly, since the failing + // matches won't have modified it). + unsigned NumSuccessfulMatches = + std::count(std::begin(Match), std::end(Match), Match_Success); + if (NumSuccessfulMatches == 1) { + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. Loop on it while changes happen so the individual + // transformations can chain off each other. + if (!MatchingInlineAsm) + while (processInstruction(Inst, Operands)) + ; + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Opcode = Inst.getOpcode(); + return false; + } else if (NumSuccessfulMatches > 1) { + assert(UnsizedMemOp && + "multiple matches only possible with unsized memory operands"); + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? EmptyRanges : UnsizedMemOp->getLocRange(); + return Error(UnsizedMemOp->getStartLoc(), + "ambiguous operand size for instruction '" + Mnemonic + "\'", + Ranges, MatchingInlineAsm); + } + + // If one instruction matched with a missing feature, report this as a + // missing feature. + if (std::count(std::begin(Match), std::end(Match), + Match_MissingFeature) == 1) { + ErrorInfo = ErrorInfoMissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + MatchingInlineAsm); + } + + // If one instruction matched with an invalid operand, report this as an + // operand failure. + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidOperand) == 1) { + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + } + + // If all of these were an outright failure, report it in a useless way. 
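+ // (None of the per-size attempts produced a single classifiable failure,
+ // so there is nothing more specific to point at.)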
+ return Error(IDLoc, "unknown instruction mnemonic", EmptyRanges, + MatchingInlineAsm); +} + +bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) { + return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo); +} + +bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { + MCAsmParser &Parser = getParser(); + StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal == ".word") + return ParseDirectiveWord(2, DirectiveID.getLoc()); + else if (IDVal.startswith(".code")) + return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); + else if (IDVal.startswith(".att_syntax")) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (Parser.getTok().getString() == "prefix") + Parser.Lex(); + else if (Parser.getTok().getString() == "noprefix") + return Error(DirectiveID.getLoc(), "'.att_syntax noprefix' is not " + "supported: registers must have a " + "'%' prefix in .att_syntax"); + } + getParser().setAssemblerDialect(0); + return false; + } else if (IDVal.startswith(".intel_syntax")) { + getParser().setAssemblerDialect(1); + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (Parser.getTok().getString() == "noprefix") + Parser.Lex(); + else if (Parser.getTok().getString() == "prefix") + return Error(DirectiveID.getLoc(), "'.intel_syntax prefix' is not " + "supported: registers must not have " + "a '%' prefix in .intel_syntax"); + } + return false; + } else if (IDVal == ".even") + return parseDirectiveEven(DirectiveID.getLoc()); + return true; +} + +/// parseDirectiveEven +/// ::= .even +bool X86AsmParser::parseDirectiveEven(SMLoc L) { + const MCSection *Section = getStreamer().getCurrentSection().first; + if (getLexer().isNot(AsmToken::EndOfStatement)) { + TokError("unexpected token in directive"); + return false; + } + if (!Section) { + getStreamer().InitSections(false); + Section = getStreamer().getCurrentSection().first; + } + if (Section->UseCodeAlign()) + getStreamer().EmitCodeAlignment(2, 0); + else + getStreamer().EmitValueToAlignment(2, 0, 1, 0); + return false; +} +/// ParseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + MCAsmParser &Parser = getParser(); + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); + if (getParser().parseExpression(Value)) + return false; + + if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else { + getStreamer().EmitValue(Value, Size, ExprLoc); + } + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. 
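+ // Anything other than a comma between expressions is currently reported
+ // at the directive's location rather than at the offending token.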
+ if (getLexer().isNot(AsmToken::Comma)) { + Error(L, "unexpected token in directive"); + return false; + } + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + +/// ParseDirectiveCode +/// ::= .code16 | .code32 | .code64 +bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { + MCAsmParser &Parser = getParser(); + if (IDVal == ".code16") { + Parser.Lex(); + if (!is16BitMode()) { + SwitchMode(X86::Mode16Bit); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + } + } else if (IDVal == ".code32") { + Parser.Lex(); + if (!is32BitMode()) { + SwitchMode(X86::Mode32Bit); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); + } + } else if (IDVal == ".code64") { + Parser.Lex(); + if (!is64BitMode()) { + SwitchMode(X86::Mode64Bit); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64); + } + } else { + Error(L, "unknown directive " + IDVal); + return false; + } + + return false; +} + +// Force static initialization. +extern "C" void LLVMInitializeX86AsmParser() { + RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target); + RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#define GET_SUBTARGET_FEATURE_NAME +#include "X86GenAsmMatcher.inc" diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h new file mode 100644 index 0000000..54538c8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -0,0 +1,39 @@ +//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H + +namespace llvm { + +inline bool isImmSExti16i8Value(uint64_t Value) { + return isInt<8>(Value) || + (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value))); +} + +inline bool isImmSExti32i8Value(uint64_t Value) { + return isInt<8>(Value) || + (isUInt<32>(Value) && isInt<8>(static_cast<int32_t>(Value))); +} + +inline bool isImmSExti64i8Value(uint64_t Value) { + return isInt<8>(Value); +} + +inline bool isImmSExti64i32Value(uint64_t Value) { + return isInt<32>(Value); +} + +inline bool isImmUnsignedi8Value(uint64_t Value) { + return isUInt<8>(Value) || isInt<8>(Value); +} + +} // End of namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h new file mode 100644 index 0000000..7ec0240 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -0,0 +1,543 @@ +//===-- X86Operand.h - Parsed X86 machine instruction --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H + +#include "X86AsmParserCommon.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/ADT/STLExtras.h" +#include "MCTargetDesc/X86MCTargetDesc.h" + +namespace llvm { + +/// X86Operand - Instances of this class represent a parsed X86 machine +/// instruction. +struct X86Operand : public MCParsedAsmOperand { + enum KindTy { + Token, + Register, + Immediate, + Memory + } Kind; + + SMLoc StartLoc, EndLoc; + SMLoc OffsetOfLoc; + StringRef SymName; + void *OpDecl; + bool AddressOf; + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct RegOp { + unsigned RegNo; + }; + + struct ImmOp { + const MCExpr *Val; + }; + + struct MemOp { + unsigned SegReg; + const MCExpr *Disp; + unsigned BaseReg; + unsigned IndexReg; + unsigned Scale; + unsigned Size; + unsigned ModeSize; + }; + + union { + struct TokOp Tok; + struct RegOp Reg; + struct ImmOp Imm; + struct MemOp Mem; + }; + + X86Operand(KindTy K, SMLoc Start, SMLoc End) + : Kind(K), StartLoc(Start), EndLoc(End) {} + + StringRef getSymName() override { return SymName; } + void *getOpDecl() override { return OpDecl; } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const override { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const override { return EndLoc; } + /// getLocRange - Get the range between the first and last token of this + /// operand. + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + /// getOffsetOfLoc - Get the location of the offset operator. + SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; } + + void print(raw_ostream &OS) const override {} + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + void setTokenValue(StringRef Value) { + assert(Kind == Token && "Invalid access!"); + Tok.Data = Value.data(); + Tok.Length = Value.size(); + } + + unsigned getReg() const override { + assert(Kind == Register && "Invalid access!"); + return Reg.RegNo; + } + + const MCExpr *getImm() const { + assert(Kind == Immediate && "Invalid access!"); + return Imm.Val; + } + + const MCExpr *getMemDisp() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Disp; + } + unsigned getMemSegReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.SegReg; + } + unsigned getMemBaseReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.BaseReg; + } + unsigned getMemIndexReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.IndexReg; + } + unsigned getMemScale() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Scale; + } + unsigned getMemModeSize() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.ModeSize; + } + + bool isToken() const override {return Kind == Token; } + + bool isImm() const override { return Kind == Immediate; } + + bool isImmSExti16i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. 
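+ // (For constants, isImmSExti16i8Value accepts values such as 0xFF80,
+ // whose low 16 bits sign-extend from an 8-bit immediate.)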
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti16i8Value(CE->getValue()); + } + bool isImmSExti32i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti32i8Value(CE->getValue()); + } + bool isImmSExti64i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti64i8Value(CE->getValue()); + } + bool isImmSExti64i32() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti64i32Value(CE->getValue()); + } + + bool isImmUnsignedi8() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + return isImmUnsignedi8Value(CE->getValue()); + } + + bool isOffsetOf() const override { + return OffsetOfLoc.getPointer(); + } + + bool needAddressOf() const override { + return AddressOf; + } + + bool isMem() const override { return Kind == Memory; } + bool isMemUnsized() const { + return Kind == Memory && Mem.Size == 0; + } + bool isMem8() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 8); + } + bool isMem16() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 16); + } + bool isMem32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32); + } + bool isMem64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64); + } + bool isMem80() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 80); + } + bool isMem128() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 128); + } + bool isMem256() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 256); + } + bool isMem512() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 512); + } + + bool isMemVX32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVX32X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31; + } + bool isMemVY32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + bool isMemVY32X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31; + } + bool isMemVX64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVX64X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31; + } + bool isMemVY64() 
const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + bool isMemVY64X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31; + } + bool isMemVZ32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } + bool isMemVZ64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } + + bool isAbsMem() const { + return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && + !getMemIndexReg() && getMemScale() == 1; + } + bool isAVX512RC() const{ + return isImm(); + } + + bool isAbsMem16() const { + return isAbsMem() && Mem.ModeSize == 16; + } + + bool isSrcIdx() const { + return !getMemIndexReg() && getMemScale() == 1 && + (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI || + getMemBaseReg() == X86::SI) && isa<MCConstantExpr>(getMemDisp()) && + cast<MCConstantExpr>(getMemDisp())->getValue() == 0; + } + bool isSrcIdx8() const { + return isMem8() && isSrcIdx(); + } + bool isSrcIdx16() const { + return isMem16() && isSrcIdx(); + } + bool isSrcIdx32() const { + return isMem32() && isSrcIdx(); + } + bool isSrcIdx64() const { + return isMem64() && isSrcIdx(); + } + + bool isDstIdx() const { + return !getMemIndexReg() && getMemScale() == 1 && + (getMemSegReg() == 0 || getMemSegReg() == X86::ES) && + (getMemBaseReg() == X86::RDI || getMemBaseReg() == X86::EDI || + getMemBaseReg() == X86::DI) && isa<MCConstantExpr>(getMemDisp()) && + cast<MCConstantExpr>(getMemDisp())->getValue() == 0; + } + bool isDstIdx8() const { + return isMem8() && isDstIdx(); + } + bool isDstIdx16() const { + return isMem16() && isDstIdx(); + } + bool isDstIdx32() const { + return isMem32() && isDstIdx(); + } + bool isDstIdx64() const { + return isMem64() && isDstIdx(); + } + + bool isMemOffs() const { + return Kind == Memory && !getMemBaseReg() && !getMemIndexReg() && + getMemScale() == 1; + } + + bool isMemOffs16_8() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs16_16() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs16_32() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs32_8() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs32_16() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs32_32() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs32_64() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 64); + } + bool isMemOffs64_8() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs64_16() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs64_32() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs64_64() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64); + } + + bool isReg() const override { return Kind == Register; } + + bool isGR32orGR64() const { + return Kind == Register && + 
(X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg())); + } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::createImm(CE->getValue())); + else + Inst.addOperand(MCOperand::createExpr(Expr)); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + static unsigned getGR32FromGR64(unsigned RegNo) { + switch (RegNo) { + default: llvm_unreachable("Unexpected register"); + case X86::RAX: return X86::EAX; + case X86::RCX: return X86::ECX; + case X86::RDX: return X86::EDX; + case X86::RBX: return X86::EBX; + case X86::RBP: return X86::EBP; + case X86::RSP: return X86::ESP; + case X86::RSI: return X86::ESI; + case X86::RDI: return X86::EDI; + case X86::R8: return X86::R8D; + case X86::R9: return X86::R9D; + case X86::R10: return X86::R10D; + case X86::R11: return X86::R11D; + case X86::R12: return X86::R12D; + case X86::R13: return X86::R13D; + case X86::R14: return X86::R14D; + case X86::R15: return X86::R15D; + case X86::RIP: return X86::EIP; + } + } + + void addGR32orGR64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned RegNo = getReg(); + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) + RegNo = getGR32FromGR64(RegNo); + Inst.addOperand(MCOperand::createReg(RegNo)); + } + void addAVX512RCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addMemOperands(MCInst &Inst, unsigned N) const { + assert((N == 5) && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + Inst.addOperand(MCOperand::createImm(getMemScale())); + Inst.addOperand(MCOperand::createReg(getMemIndexReg())); + addExpr(Inst, getMemDisp()); + Inst.addOperand(MCOperand::createReg(getMemSegReg())); + } + + void addAbsMemOperands(MCInst &Inst, unsigned N) const { + assert((N == 1) && "Invalid number of operands!"); + // Add as immediates when possible. + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp())) + Inst.addOperand(MCOperand::createImm(CE->getValue())); + else + Inst.addOperand(MCOperand::createExpr(getMemDisp())); + } + + void addSrcIdxOperands(MCInst &Inst, unsigned N) const { + assert((N == 2) && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + Inst.addOperand(MCOperand::createReg(getMemSegReg())); + } + void addDstIdxOperands(MCInst &Inst, unsigned N) const { + assert((N == 1) && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + } + + void addMemOffsOperands(MCInst &Inst, unsigned N) const { + assert((N == 2) && "Invalid number of operands!"); + // Add as immediates when possible. 
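+ // (A memory-offset operand is represented by just the displacement and
+ // the segment register; the base/index/scale fields emitted for full
+ // memory operands above are omitted here.)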
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp())) + Inst.addOperand(MCOperand::createImm(CE->getValue())); + else + Inst.addOperand(MCOperand::createExpr(getMemDisp())); + Inst.addOperand(MCOperand::createReg(getMemSegReg())); + } + + static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) { + SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); + auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc); + Res->Tok.Data = Str.data(); + Res->Tok.Length = Str.size(); + return Res; + } + + static std::unique_ptr<X86Operand> + CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, + bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(), + StringRef SymName = StringRef(), void *OpDecl = nullptr) { + auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc); + Res->Reg.RegNo = RegNo; + Res->AddressOf = AddressOf; + Res->OffsetOfLoc = OffsetOfLoc; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + return Res; + } + + static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val, + SMLoc StartLoc, SMLoc EndLoc) { + auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); + Res->Imm.Val = Val; + return Res; + } + + /// Create an absolute memory operand. + static std::unique_ptr<X86Operand> + CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = nullptr) { + auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + Res->Mem.SegReg = 0; + Res->Mem.Disp = Disp; + Res->Mem.BaseReg = 0; + Res->Mem.IndexReg = 0; + Res->Mem.Scale = 1; + Res->Mem.Size = Size; + Res->Mem.ModeSize = ModeSize; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + Res->AddressOf = false; + return Res; + } + + /// Create a generalized memory operand. + static std::unique_ptr<X86Operand> + CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, + SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = nullptr) { + // We should never just have a displacement, that should be parsed as an + // absolute memory operand. + assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); + + // The scale should always be one of {1,2,4,8}. + assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && + "Invalid scale!"); + auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + Res->Mem.SegReg = SegReg; + Res->Mem.Disp = Disp; + Res->Mem.BaseReg = BaseReg; + Res->Mem.IndexReg = IndexReg; + Res->Mem.Scale = Scale; + Res->Mem.Size = Size; + Res->Mem.ModeSize = ModeSize; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + Res->AddressOf = false; + return Res; + } +}; + +} // End of namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp new file mode 100644 index 0000000..ce8fcf1 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -0,0 +1,1009 @@ +//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. 
+// It contains code to translate the data produced by the decoder into +// MCInsts. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// + +#include "X86Disassembler.h" +#include "X86DisassemblerDecoder.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::X86Disassembler; + +#define DEBUG_TYPE "x86-disassembler" + +#define GET_REGINFO_ENUM +#include "X86GenRegisterInfo.inc" +#define GET_INSTRINFO_ENUM +#include "X86GenInstrInfo.inc" +#define GET_SUBTARGETINFO_ENUM +#include "X86GenSubtargetInfo.inc" + +void llvm::X86Disassembler::Debug(const char *file, unsigned line, + const char *s) { + dbgs() << file << ":" << line << ": " << s; +} + +const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode, + const void *mii) { + const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii); + return MII->getName(Opcode); +} + +#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s)); + +namespace llvm { + +// Fill-ins to make the compiler happy. These constants are never actually +// assigned; they are just filler to make an automatically-generated switch +// statement work. +namespace X86 { + enum { + BX_SI = 500, + BX_DI = 501, + BP_SI = 502, + BP_DI = 503, + sib = 504, + sib64 = 505 + }; +} + +extern Target TheX86_32Target, TheX86_64Target; + +} + +static bool translateInstruction(MCInst &target, + InternalInstruction &source, + const MCDisassembler *Dis); + +X86GenericDisassembler::X86GenericDisassembler( + const MCSubtargetInfo &STI, + MCContext &Ctx, + std::unique_ptr<const MCInstrInfo> MII) + : MCDisassembler(STI, Ctx), MII(std::move(MII)) { + const FeatureBitset &FB = STI.getFeatureBits(); + if (FB[X86::Mode16Bit]) { + fMode = MODE_16BIT; + return; + } else if (FB[X86::Mode32Bit]) { + fMode = MODE_32BIT; + return; + } else if (FB[X86::Mode64Bit]) { + fMode = MODE_64BIT; + return; + } + + llvm_unreachable("Invalid CPU mode"); +} + +namespace { +struct Region { + ArrayRef<uint8_t> Bytes; + uint64_t Base; + Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {} +}; +} // end anonymous namespace + +/// A callback function that wraps the readByte method from Region. +/// +/// @param Arg - The generic callback parameter. In this case, this should +/// be a pointer to a Region. +/// @param Byte - A pointer to the byte to be read. +/// @param Address - The address to be read. +static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) { + auto *R = static_cast<const Region *>(Arg); + ArrayRef<uint8_t> Bytes = R->Bytes; + unsigned Index = Address - R->Base; + if (Bytes.size() <= Index) + return -1; + *Byte = Bytes[Index]; + return 0; +} + +/// logger - a callback function that wraps the operator<< method from +/// raw_ostream. +/// +/// @param arg - The generic callback parameter. This should be a pointe +/// to a raw_ostream. +/// @param log - A string to be logged. logger() adds a newline. 
+static void logger(void* arg, const char* log) { + if (!arg) + return; + + raw_ostream &vStream = *(static_cast<raw_ostream*>(arg)); + vStream << log << "\n"; +} + +// +// Public interface for the disassembler +// + +MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( + MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &VStream, raw_ostream &CStream) const { + CommentStream = &CStream; + + InternalInstruction InternalInstr; + + dlog_t LoggerFn = logger; + if (&VStream == &nulls()) + LoggerFn = nullptr; // Disable logging completely if it's going to nulls(). + + Region R(Bytes, Address); + + int Ret = decodeInstruction(&InternalInstr, regionReader, (const void *)&R, + LoggerFn, (void *)&VStream, + (const void *)MII.get(), Address, fMode); + + if (Ret) { + Size = InternalInstr.readerCursor - Address; + return Fail; + } else { + Size = InternalInstr.length; + return (!translateInstruction(Instr, InternalInstr, this)) ? Success : Fail; + } +} + +// +// Private code that translates from struct InternalInstructions to MCInsts. +// + +/// translateRegister - Translates an internal register to the appropriate LLVM +/// register, and appends it as an operand to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param reg - The Reg to append. +static void translateRegister(MCInst &mcInst, Reg reg) { +#define ENTRY(x) X86::x, + uint8_t llvmRegnums[] = { + ALL_REGS + 0 + }; +#undef ENTRY + + uint8_t llvmRegnum = llvmRegnums[reg]; + mcInst.addOperand(MCOperand::createReg(llvmRegnum)); +} + +/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the +/// immediate Value in the MCInst. +/// +/// @param Value - The immediate Value, has had any PC adjustment made by +/// the caller. +/// @param isBranch - If the instruction is a branch instruction +/// @param Address - The starting address of the instruction +/// @param Offset - The byte offset to this immediate in the instruction +/// @param Width - The byte width of this immediate in the instruction +/// +/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was +/// called then that function is called to get any symbolic information for the +/// immediate in the instruction using the Address, Offset and Width. If that +/// returns non-zero then the symbolic information it returns is used to create +/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo() +/// returns zero and isBranch is true then a symbol look up for immediate Value +/// is done and if a symbol is found an MCExpr is created with that, else +/// an MCExpr with the immediate Value is created. This function returns true +/// if it adds an operand to the MCInst and false otherwise. +static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, + uint64_t Address, uint64_t Offset, + uint64_t Width, MCInst &MI, + const MCDisassembler *Dis) { + return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, + Offset, Width); +} + +/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being +/// referenced by a load instruction with the base register that is the rip. +/// These can often be addresses in a literal pool. The Address of the +/// instruction and its immediate Value are used to determine the address +/// being referenced in the literal pool entry. The SymbolLookUp call back will +/// return a pointer to a literal 'C' string if the referenced address is an +/// address into a section with 'C' string literals. 
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value, + const void *Decoder) { + const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); + Dis->tryAddingPcLoadReferenceComment(Value, Address); +} + +static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = { + 0, // SEG_OVERRIDE_NONE + X86::CS, + X86::SS, + X86::DS, + X86::ES, + X86::FS, + X86::GS +}; + +/// translateSrcIndex - Appends a source index operand to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param insn - The internal instruction. +static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) { + unsigned baseRegNo; + + if (insn.mode == MODE_64BIT) + baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::RSI; + else if (insn.mode == MODE_32BIT) + baseRegNo = insn.prefixPresent[0x67] ? X86::SI : X86::ESI; + else { + assert(insn.mode == MODE_16BIT); + baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::SI; + } + MCOperand baseReg = MCOperand::createReg(baseRegNo); + mcInst.addOperand(baseReg); + + MCOperand segmentReg; + segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); + mcInst.addOperand(segmentReg); + return false; +} + +/// translateDstIndex - Appends a destination index operand to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param insn - The internal instruction. + +static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) { + unsigned baseRegNo; + + if (insn.mode == MODE_64BIT) + baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::RDI; + else if (insn.mode == MODE_32BIT) + baseRegNo = insn.prefixPresent[0x67] ? X86::DI : X86::EDI; + else { + assert(insn.mode == MODE_16BIT); + baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::DI; + } + MCOperand baseReg = MCOperand::createReg(baseRegNo); + mcInst.addOperand(baseReg); + return false; +} + +/// translateImmediate - Appends an immediate operand to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param immediate - The immediate value to append. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The internal instruction. +static void translateImmediate(MCInst &mcInst, uint64_t immediate, + const OperandSpecifier &operand, + InternalInstruction &insn, + const MCDisassembler *Dis) { + // Sign-extend the immediate if necessary. + + OperandType type = (OperandType)operand.type; + + bool isBranch = false; + uint64_t pcrel = 0; + if (type == TYPE_RELv) { + isBranch = true; + pcrel = insn.startLocation + + insn.immediateOffset + insn.immediateSize; + switch (insn.displacementSize) { + default: + break; + case 1: + if(immediate & 0x80) + immediate |= ~(0xffull); + break; + case 2: + if(immediate & 0x8000) + immediate |= ~(0xffffull); + break; + case 4: + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); + break; + case 8: + break; + } + } + // By default sign-extend all X86 immediates based on their encoding. + else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 || + type == TYPE_IMM64 || type == TYPE_IMMv) { + switch (operand.encoding) { + default: + break; + case ENCODING_IB: + if(immediate & 0x80) + immediate |= ~(0xffull); + break; + case ENCODING_IW: + if(immediate & 0x8000) + immediate |= ~(0xffffull); + break; + case ENCODING_ID: + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); + break; + case ENCODING_IO: + break; + } + } else if (type == TYPE_IMM3) { + // Check for immediates that printSSECC can't handle. 
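+ // (printSSECC only has symbolic names for the eight SSE comparison
+ // predicates 0-7; larger values are remapped below to the "_alt" forms,
+ // which print the raw immediate instead.)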
+ if (immediate >= 8) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break; + case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break; + case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break; + case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break; + case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break; + case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break; + case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break; + case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break; + case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break; + case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break; + case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break; + case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break; + case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break; + case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break; + case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break; + case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break; + case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break; + case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break; + case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break; + case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break; + case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break; + case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break; + case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break; + case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break; + } + // Switch opcode to the one that doesn't get special printing. + mcInst.setOpcode(NewOpc); + } + } else if (type == TYPE_IMM5) { + // Check for immediates that printAVXCC can't handle. + if (immediate >= 32) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; + case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; + case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; + case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; + case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; + case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; + case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; + case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; + case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; + case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; + case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; + case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; + case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; + case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; + case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break; + case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; + case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; + case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break; + case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; + case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; + case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; + case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; + } + // Switch opcode to the one that doesn't get special printing. 
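+ // (The "_alt" variant prints the raw immediate rather than an AVX
+ // comparison-predicate name.)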
+ mcInst.setOpcode(NewOpc); + } + } else if (type == TYPE_AVX512ICC) { + if (immediate >= 8 || ((immediate & 0x3) == 3)) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPQZrmibk_alt; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break; + case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break; + case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; 
break; + case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break; + case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break; + case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break; + case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break; + case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break; + case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break; + case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break; + case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break; + case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break; + case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break; + case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break; + case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break; + case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break; + case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break; + case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break; + case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break; + case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break; + case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break; + case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break; + case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break; + case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break; + case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break; + case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break; + case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break; + case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break; + case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break; + case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break; + case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break; + case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break; + case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break; + case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break; + case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break; + case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break; + case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break; + case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break; + case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break; + case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break; + case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break; + case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break; + case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break; + case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break; + case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break; + case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break; + case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break; + case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break; + case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break; + case X86::VPCMPUWZ128rmi: NewOpc = X86::VPCMPUWZ128rmi_alt; break; + case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break; + case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break; + case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break; + case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break; + case X86::VPCMPUWZ256rmik: NewOpc = 
X86::VPCMPUWZ256rmik_alt; break; + case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break; + case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break; + case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break; + case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break; + case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break; + case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break; + } + // Switch opcode to the one that doesn't get special printing. + mcInst.setOpcode(NewOpc); + } + } + + switch (type) { + case TYPE_XMM32: + case TYPE_XMM64: + case TYPE_XMM128: + mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4))); + return; + case TYPE_XMM256: + mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4))); + return; + case TYPE_XMM512: + mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4))); + return; + case TYPE_BNDR: + mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4))); + case TYPE_REL8: + isBranch = true; + pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; + if (immediate & 0x80) + immediate |= ~(0xffull); + break; + case TYPE_REL16: + isBranch = true; + pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; + if (immediate & 0x8000) + immediate |= ~(0xffffull); + break; + case TYPE_REL32: + case TYPE_REL64: + isBranch = true; + pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); + break; + default: + // operand is 64 bits wide. Do nothing. + break; + } + + if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation, + insn.immediateOffset, insn.immediateSize, + mcInst, Dis)) + mcInst.addOperand(MCOperand::createImm(immediate)); + + if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 || + type == TYPE_MOFFS32 || type == TYPE_MOFFS64) { + MCOperand segmentReg; + segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); + mcInst.addOperand(segmentReg); + } +} + +/// translateRMRegister - Translates a register stored in the R/M field of the +/// ModR/M byte to its LLVM equivalent and appends it to an MCInst. +/// @param mcInst - The MCInst to append to. +/// @param insn - The internal instruction to extract the R/M field +/// from. 
+/// @return - 0 on success; -1 otherwise +static bool translateRMRegister(MCInst &mcInst, + InternalInstruction &insn) { + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { + debug("A R/M register operand may not have a SIB byte"); + return true; + } + + switch (insn.eaBase) { + default: + debug("Unexpected EA base register"); + return true; + case EA_BASE_NONE: + debug("EA_BASE_NONE for ModR/M base"); + return true; +#define ENTRY(x) case EA_BASE_##x: + ALL_EA_BASES +#undef ENTRY + debug("A R/M register operand may not have a base; " + "the operand must be a register."); + return true; +#define ENTRY(x) \ + case EA_REG_##x: \ + mcInst.addOperand(MCOperand::createReg(X86::x)); break; + ALL_REGS +#undef ENTRY + } + + return false; +} + +/// translateRMMemory - Translates a memory operand stored in the Mod and R/M +/// fields of an internal instruction (and possibly its SIB byte) to a memory +/// operand in LLVM's format, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param insn - The instruction to extract Mod, R/M, and SIB fields +/// from. +/// @return - 0 on success; nonzero otherwise +static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, + const MCDisassembler *Dis) { + // Addresses in an MCInst are represented as five operands: + // 1. basereg (register) The R/M base, or (if there is a SIB) the + // SIB base + // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified + // scale amount + // 3. indexreg (register) x86_registerNONE, or (if there is a SIB) + // the index (which is multiplied by the + // scale amount) + // 4. displacement (immediate) 0, or the displacement if there is one + // 5. segmentreg (register) x86_registerNONE for now, but could be set + // if we have segment overrides + + MCOperand baseReg; + MCOperand scaleAmount; + MCOperand indexReg; + MCOperand displacement; + MCOperand segmentReg; + uint64_t pcrel = 0; + + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { + if (insn.sibBase != SIB_BASE_NONE) { + switch (insn.sibBase) { + default: + debug("Unexpected sibBase"); + return true; +#define ENTRY(x) \ + case SIB_BASE_##x: \ + baseReg = MCOperand::createReg(X86::x); break; + ALL_SIB_BASES +#undef ENTRY + } + } else { + baseReg = MCOperand::createReg(0); + } + + // Check whether we are handling VSIB addressing mode for GATHER. + // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and + // we should use SIB_INDEX_XMM4|YMM4 for VSIB. + // I don't see a way to get the correct IndexReg in readSIB: + // We can tell whether it is VSIB or SIB after instruction ID is decoded, + // but instruction ID may not be decoded yet when calling readSIB. 
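+    // For example, a VPGATHERDDrm decoded in 64-bit mode with sibIndex set
+    // to SIB_INDEX_R12 is remapped below to SIB_INDEX_XMM12, while a
+    // sibIndex of SIB_INDEX_NONE becomes SIB_INDEX_XMM4, because under VSIB
+    // the index encoding 0b100 means XMM4 rather than "no index".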
+ uint32_t Opcode = mcInst.getOpcode(); + bool IndexIs128 = (Opcode == X86::VGATHERDPDrm || + Opcode == X86::VGATHERDPDYrm || + Opcode == X86::VGATHERQPDrm || + Opcode == X86::VGATHERDPSrm || + Opcode == X86::VGATHERQPSrm || + Opcode == X86::VPGATHERDQrm || + Opcode == X86::VPGATHERDQYrm || + Opcode == X86::VPGATHERQQrm || + Opcode == X86::VPGATHERDDrm || + Opcode == X86::VPGATHERQDrm); + bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm || + Opcode == X86::VGATHERDPSYrm || + Opcode == X86::VGATHERQPSYrm || + Opcode == X86::VGATHERDPDZrm || + Opcode == X86::VPGATHERDQZrm || + Opcode == X86::VPGATHERQQYrm || + Opcode == X86::VPGATHERDDYrm || + Opcode == X86::VPGATHERQDYrm); + bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm || + Opcode == X86::VGATHERDPSZrm || + Opcode == X86::VGATHERQPSZrm || + Opcode == X86::VPGATHERQQZrm || + Opcode == X86::VPGATHERDDZrm || + Opcode == X86::VPGATHERQDZrm); + if (IndexIs128 || IndexIs256 || IndexIs512) { + unsigned IndexOffset = insn.sibIndex - + (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX); + SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 : + IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0; + insn.sibIndex = (SIBIndex)(IndexBase + + (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset)); + } + + if (insn.sibIndex != SIB_INDEX_NONE) { + switch (insn.sibIndex) { + default: + debug("Unexpected sibIndex"); + return true; +#define ENTRY(x) \ + case SIB_INDEX_##x: \ + indexReg = MCOperand::createReg(X86::x); break; + EA_BASES_32BIT + EA_BASES_64BIT + REGS_XMM + REGS_YMM + REGS_ZMM +#undef ENTRY + } + } else { + indexReg = MCOperand::createReg(0); + } + + scaleAmount = MCOperand::createImm(insn.sibScale); + } else { + switch (insn.eaBase) { + case EA_BASE_NONE: + if (insn.eaDisplacement == EA_DISP_NONE) { + debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base"); + return true; + } + if (insn.mode == MODE_64BIT){ + pcrel = insn.startLocation + + insn.displacementOffset + insn.displacementSize; + tryAddingPcLoadReferenceComment(insn.startLocation + + insn.displacementOffset, + insn.displacement + pcrel, Dis); + baseReg = MCOperand::createReg(X86::RIP); // Section 2.2.1.6 + } + else + baseReg = MCOperand::createReg(0); + + indexReg = MCOperand::createReg(0); + break; + case EA_BASE_BX_SI: + baseReg = MCOperand::createReg(X86::BX); + indexReg = MCOperand::createReg(X86::SI); + break; + case EA_BASE_BX_DI: + baseReg = MCOperand::createReg(X86::BX); + indexReg = MCOperand::createReg(X86::DI); + break; + case EA_BASE_BP_SI: + baseReg = MCOperand::createReg(X86::BP); + indexReg = MCOperand::createReg(X86::SI); + break; + case EA_BASE_BP_DI: + baseReg = MCOperand::createReg(X86::BP); + indexReg = MCOperand::createReg(X86::DI); + break; + default: + indexReg = MCOperand::createReg(0); + switch (insn.eaBase) { + default: + debug("Unexpected eaBase"); + return true; + // Here, we will use the fill-ins defined above. However, + // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and + // sib and sib64 were handled in the top-level if, so they're only + // placeholders to keep the compiler happy. 
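+      // For example, a 32-bit address with mod == 0b01 and rm == 0b110
+      // decodes to eaBase == EA_BASE_ESI, which reaches this default case and
+      // is mapped by the fill-ins below to the ESI base register.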
+#define ENTRY(x) \ + case EA_BASE_##x: \ + baseReg = MCOperand::createReg(X86::x); break; + ALL_EA_BASES +#undef ENTRY +#define ENTRY(x) case EA_REG_##x: + ALL_REGS +#undef ENTRY + debug("A R/M memory operand may not be a register; " + "the base field must be a base."); + return true; + } + } + + scaleAmount = MCOperand::createImm(1); + } + + displacement = MCOperand::createImm(insn.displacement); + + segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); + + mcInst.addOperand(baseReg); + mcInst.addOperand(scaleAmount); + mcInst.addOperand(indexReg); + if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false, + insn.startLocation, insn.displacementOffset, + insn.displacementSize, mcInst, Dis)) + mcInst.addOperand(displacement); + mcInst.addOperand(segmentReg); + return false; +} + +/// translateRM - Translates an operand stored in the R/M (and possibly SIB) +/// byte of an instruction to LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The instruction to extract Mod, R/M, and SIB fields +/// from. +/// @return - 0 on success; nonzero otherwise +static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, + InternalInstruction &insn, const MCDisassembler *Dis) { + switch (operand.type) { + default: + debug("Unexpected type for a R/M operand"); + return true; + case TYPE_R8: + case TYPE_R16: + case TYPE_R32: + case TYPE_R64: + case TYPE_Rv: + case TYPE_MM64: + case TYPE_XMM: + case TYPE_XMM32: + case TYPE_XMM64: + case TYPE_XMM128: + case TYPE_XMM256: + case TYPE_XMM512: + case TYPE_VK1: + case TYPE_VK2: + case TYPE_VK4: + case TYPE_VK8: + case TYPE_VK16: + case TYPE_VK32: + case TYPE_VK64: + case TYPE_DEBUGREG: + case TYPE_CONTROLREG: + case TYPE_BNDR: + return translateRMRegister(mcInst, insn); + case TYPE_M: + case TYPE_M8: + case TYPE_M16: + case TYPE_M32: + case TYPE_M64: + case TYPE_M128: + case TYPE_M256: + case TYPE_M512: + case TYPE_Mv: + case TYPE_M32FP: + case TYPE_M64FP: + case TYPE_M80FP: + case TYPE_M1616: + case TYPE_M1632: + case TYPE_M1664: + case TYPE_LEA: + return translateRMMemory(mcInst, insn, Dis); + } +} + +/// translateFPRegister - Translates a stack position on the FPU stack to its +/// LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param stackPos - The stack position to translate. +static void translateFPRegister(MCInst &mcInst, + uint8_t stackPos) { + mcInst.addOperand(MCOperand::createReg(X86::ST0 + stackPos)); +} + +/// translateMaskRegister - Translates a 3-bit mask register number to +/// LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param maskRegNum - Number of mask register from 0 to 7. +/// @return - false on success; true otherwise. +static bool translateMaskRegister(MCInst &mcInst, + uint8_t maskRegNum) { + if (maskRegNum >= 8) { + debug("Invalid mask register number"); + return true; + } + + mcInst.addOperand(MCOperand::createReg(X86::K0 + maskRegNum)); + return false; +} + +/// translateOperand - Translates an operand stored in an internal instruction +/// to LLVM's format and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The internal instruction. +/// @return - false on success; true otherwise. 
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, + InternalInstruction &insn, + const MCDisassembler *Dis) { + switch (operand.encoding) { + default: + debug("Unhandled operand encoding during translation"); + return true; + case ENCODING_REG: + translateRegister(mcInst, insn.reg); + return false; + case ENCODING_WRITEMASK: + return translateMaskRegister(mcInst, insn.writemask); + CASE_ENCODING_RM: + return translateRM(mcInst, operand, insn, Dis); + case ENCODING_CB: + case ENCODING_CW: + case ENCODING_CD: + case ENCODING_CP: + case ENCODING_CO: + case ENCODING_CT: + debug("Translation of code offsets isn't supported."); + return true; + case ENCODING_IB: + case ENCODING_IW: + case ENCODING_ID: + case ENCODING_IO: + case ENCODING_Iv: + case ENCODING_Ia: + translateImmediate(mcInst, + insn.immediates[insn.numImmediatesTranslated++], + operand, + insn, + Dis); + return false; + case ENCODING_SI: + return translateSrcIndex(mcInst, insn); + case ENCODING_DI: + return translateDstIndex(mcInst, insn); + case ENCODING_RB: + case ENCODING_RW: + case ENCODING_RD: + case ENCODING_RO: + case ENCODING_Rv: + translateRegister(mcInst, insn.opcodeRegister); + return false; + case ENCODING_FP: + translateFPRegister(mcInst, insn.modRM & 7); + return false; + case ENCODING_VVVV: + translateRegister(mcInst, insn.vvvv); + return false; + case ENCODING_DUP: + return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0], + insn, Dis); + } +} + +/// translateInstruction - Translates an internal instruction and all its +/// operands to an MCInst. +/// +/// @param mcInst - The MCInst to populate with the instruction's data. +/// @param insn - The internal instruction. +/// @return - false on success; true otherwise. +static bool translateInstruction(MCInst &mcInst, + InternalInstruction &insn, + const MCDisassembler *Dis) { + if (!insn.spec) { + debug("Instruction has no specification"); + return true; + } + + mcInst.clear(); + mcInst.setOpcode(insn.instructionID); + // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 + // prefix bytes should be disassembled as xrelease and xacquire then set the + // opcode to those instead of the rep and repne opcodes. + if (insn.xAcquireRelease) { + if(mcInst.getOpcode() == X86::REP_PREFIX) + mcInst.setOpcode(X86::XRELEASE_PREFIX); + else if(mcInst.getOpcode() == X86::REPNE_PREFIX) + mcInst.setOpcode(X86::XACQUIRE_PREFIX); + } + + insn.numImmediatesTranslated = 0; + + for (const auto &Op : insn.operands) { + if (Op.encoding != ENCODING_NONE) { + if (translateOperand(mcInst, Op, insn, Dis)) { + return true; + } + } + } + + return false; +} + +static MCDisassembler *createX86Disassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo()); + return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII)); +} + +extern "C" void LLVMInitializeX86Disassembler() { + // Register the disassembler. 
+ TargetRegistry::RegisterMCDisassembler(TheX86_32Target, + createX86Disassembler); + TargetRegistry::RegisterMCDisassembler(TheX86_64Target, + createX86Disassembler); +} diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h new file mode 100644 index 0000000..d7f426b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h @@ -0,0 +1,112 @@ +//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and +// 64-bit X86 instruction sets. The main decode sequence for an assembly +// instruction in this disassembler is: +// +// 1. Read the prefix bytes and determine the attributes of the instruction. +// These attributes, recorded in enum attributeBits +// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM +// provides a mapping from bitmasks to contexts, which are represented by +// enum InstructionContext (ibid.). +// +// 2. Read the opcode, and determine what kind of opcode it is. The +// disassembler distinguishes four kinds of opcodes, which are enumerated in +// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte +// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a +// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context. +// +// 3. Depending on the opcode type, look in one of four ClassDecision structures +// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which +// OpcodeDecision (ibid.) to look the opcode in. Look up the opcode, to get +// a ModRMDecision (ibid.). +// +// 4. Some instructions, such as escape opcodes or extended opcodes, or even +// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the +// ModR/M byte to complete decode. The ModRMDecision's type is an entry from +// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the +// ModR/M byte is required and how to interpret it. +// +// 5. After resolving the ModRMDecision, the disassembler has a unique ID +// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in +// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and +// meanings of its operands. +// +// 6. For each operand, its encoding is an entry from OperandEncoding +// (X86DisassemblerDecoderCommon.h) and its type is an entry from +// OperandType (ibid.). The encoding indicates how to read it from the +// instruction; the type indicates how to interpret the value once it has +// been read. For example, a register operand could be stored in the R/M +// field of the ModR/M byte, the REG field of the ModR/M byte, or added to +// the main opcode. This is orthogonal from its meaning (an GPR or an XMM +// register, for instance). Given this information, the operands can be +// extracted and interpreted. +// +// 7. As the last step, the disassembler translates the instruction information +// and operands into a format understandable by the client - in this case, an +// MCInst for use by the MC infrastructure. 
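+//
+// As a concrete illustration of steps 1-7, consider the byte sequence
+// 66 0f 58 c1. The 0x66 prefix contributes the operand-size attribute
+// (step 1); the 0x0f escape selects the two-byte opcode map, where 0x58 is
+// the opcode (steps 2-3); the ModR/M byte 0xc1 (mod = 0b11, reg = 0, r/m = 1)
+// completes the decode to the register form of ADDPD, i.e. ADDPDrr
+// (steps 4-5); finally, the destination XMM0 is read from the reg field and
+// the source XMM1 from the R/M field, and both are appended to the resulting
+// MCInst (steps 6-7).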
+// +// The disassembler is broken broadly into two parts: the table emitter that +// emits the instruction decode tables discussed above during compilation, and +// the disassembler itself. The table emitter is documented in more detail in +// utils/TableGen/X86DisassemblerEmitter.h. +// +// X86Disassembler.h contains the public interface for the disassembler, +// adhering to the MCDisassembler interface. +// X86Disassembler.cpp contains the code responsible for step 7, and for +// invoking the decoder to execute steps 1-6. +// X86DisassemblerDecoderCommon.h contains the definitions needed by both the +// table emitter and the disassembler. +// X86DisassemblerDecoder.h contains the public interface of the decoder, +// factored out into C for possible use by other projects. +// X86DisassemblerDecoder.c contains the source code of the decoder, which is +// responsible for steps 1-6. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H + +#include "X86DisassemblerDecoderCommon.h" +#include "llvm/MC/MCDisassembler.h" + +namespace llvm { + +class MCInst; +class MCInstrInfo; +class MCSubtargetInfo; +class MemoryObject; +class raw_ostream; + +namespace X86Disassembler { + +/// Generic disassembler for all X86 platforms. All each platform class should +/// have to do is subclass the constructor, and provide a different +/// disassemblerMode value. +class X86GenericDisassembler : public MCDisassembler { + std::unique_ptr<const MCInstrInfo> MII; +public: + X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + std::unique_ptr<const MCInstrInfo> MII); +public: + DecodeStatus getInstruction(MCInst &instr, uint64_t &size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &vStream, + raw_ostream &cStream) const override; + +private: + DisassemblerMode fMode; +}; + +} // namespace X86Disassembler + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp new file mode 100644 index 0000000..040143b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -0,0 +1,1909 @@ +//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains the implementation of the instruction decoder. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// + +#include <cstdarg> /* for va_*() */ +#include <cstdio> /* for vsnprintf() */ +#include <cstdlib> /* for exit() */ +#include <cstring> /* for memset() */ + +#include "X86DisassemblerDecoder.h" + +using namespace llvm::X86Disassembler; + +/// Specifies whether a ModR/M byte is needed and (if so) which +/// instruction each possible value of the ModR/M byte corresponds to. Once +/// this information is known, we have narrowed down to a single instruction. 
+struct ModRMDecision { + uint8_t modrm_type; + uint16_t instructionIDs; +}; + +/// Specifies which set of ModR/M->instruction tables to look at +/// given a particular opcode. +struct OpcodeDecision { + ModRMDecision modRMDecisions[256]; +}; + +/// Specifies which opcode->instruction tables to look at given +/// a particular context (set of attributes). Since there are many possible +/// contexts, the decoder first uses CONTEXTS_SYM to determine which context +/// applies given a specific set of attributes. Hence there are only IC_max +/// entries in this table, rather than 2^(ATTR_max). +struct ContextDecision { + OpcodeDecision opcodeDecisions[IC_max]; +}; + +#include "X86GenDisassemblerTables.inc" + +#ifndef NDEBUG +#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0) +#else +#define debug(s) do { } while (0) +#endif + + +/* + * contextForAttrs - Client for the instruction context table. Takes a set of + * attributes and returns the appropriate decode context. + * + * @param attrMask - Attributes, from the enumeration attributeBits. + * @return - The InstructionContext to use when looking up an + * an instruction with these attributes. + */ +static InstructionContext contextForAttrs(uint16_t attrMask) { + return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]); +} + +/* + * modRMRequired - Reads the appropriate instruction table to determine whether + * the ModR/M byte is required to decode a particular instruction. + * + * @param type - The opcode type (i.e., how many bytes it has). + * @param insnContext - The context for the instruction, as returned by + * contextForAttrs. + * @param opcode - The last byte of the instruction's opcode, not counting + * ModR/M extensions and escapes. + * @return - true if the ModR/M byte is required, false otherwise. + */ +static int modRMRequired(OpcodeType type, + InstructionContext insnContext, + uint16_t opcode) { + const struct ContextDecision* decision = nullptr; + + switch (type) { + case ONEBYTE: + decision = &ONEBYTE_SYM; + break; + case TWOBYTE: + decision = &TWOBYTE_SYM; + break; + case THREEBYTE_38: + decision = &THREEBYTE38_SYM; + break; + case THREEBYTE_3A: + decision = &THREEBYTE3A_SYM; + break; + case XOP8_MAP: + decision = &XOP8_MAP_SYM; + break; + case XOP9_MAP: + decision = &XOP9_MAP_SYM; + break; + case XOPA_MAP: + decision = &XOPA_MAP_SYM; + break; + } + + return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. + modrm_type != MODRM_ONEENTRY; +} + +/* + * decode - Reads the appropriate instruction table to obtain the unique ID of + * an instruction. + * + * @param type - See modRMRequired(). + * @param insnContext - See modRMRequired(). + * @param opcode - See modRMRequired(). + * @param modRM - The ModR/M byte if required, or any value if not. + * @return - The UID of the instruction, or 0 on failure. 
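+ *
+ * For example, a MODRM_FULL decision has 256 entries in modRMTable, one per
+ * possible ModR/M byte, while a MODRM_SPLITREG decision has eight entries
+ * indexed by the reg field plus a second bank of eight that is used when
+ * mod == 0b11.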
+ */ +static InstrUID decode(OpcodeType type, + InstructionContext insnContext, + uint8_t opcode, + uint8_t modRM) { + const struct ModRMDecision* dec = nullptr; + + switch (type) { + case ONEBYTE: + dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case TWOBYTE: + dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_38: + dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_3A: + dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOP8_MAP: + dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOP9_MAP: + dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOPA_MAP: + dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + } + + switch (dec->modrm_type) { + default: + debug("Corrupt table! Unknown modrm_type"); + return 0; + case MODRM_ONEENTRY: + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITRM: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+1]; + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITREG: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; + case MODRM_SPLITMISC: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; + case MODRM_FULL: + return modRMTable[dec->instructionIDs+modRM]; + } +} + +/* + * specifierForUID - Given a UID, returns the name and operand specification for + * that instruction. + * + * @param uid - The unique ID for the instruction. This should be returned by + * decode(); specifierForUID will not check bounds. + * @return - A pointer to the specification for that instruction. + */ +static const struct InstructionSpecifier *specifierForUID(InstrUID uid) { + return &INSTRUCTIONS_SYM[uid]; +} + +/* + * consumeByte - Uses the reader function provided by the user to consume one + * byte from the instruction's memory and advance the cursor. + * + * @param insn - The instruction with the reader function to use. The cursor + * for this instruction is advanced. + * @param byte - A pointer to a pre-allocated memory buffer to be populated + * with the data read. + * @return - 0 if the read was successful; nonzero otherwise. + */ +static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { + int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); + + if (!ret) + ++(insn->readerCursor); + + return ret; +} + +/* + * lookAtByte - Like consumeByte, but does not advance the cursor. + * + * @param insn - See consumeByte(). + * @param byte - See consumeByte(). + * @return - See consumeByte(). 
+ */ +static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { + return insn->reader(insn->readerArg, byte, insn->readerCursor); +} + +static void unconsumeByte(struct InternalInstruction* insn) { + insn->readerCursor--; +} + +#define CONSUME_FUNC(name, type) \ + static int name(struct InternalInstruction* insn, type* ptr) { \ + type combined = 0; \ + unsigned offset; \ + for (offset = 0; offset < sizeof(type); ++offset) { \ + uint8_t byte; \ + int ret = insn->reader(insn->readerArg, \ + &byte, \ + insn->readerCursor + offset); \ + if (ret) \ + return ret; \ + combined = combined | ((uint64_t)byte << (offset * 8)); \ + } \ + *ptr = combined; \ + insn->readerCursor += sizeof(type); \ + return 0; \ + } + +/* + * consume* - Use the reader function provided by the user to consume data + * values of various sizes from the instruction's memory and advance the + * cursor appropriately. These readers perform endian conversion. + * + * @param insn - See consumeByte(). + * @param ptr - A pointer to a pre-allocated memory of appropriate size to + * be populated with the data read. + * @return - See consumeByte(). + */ +CONSUME_FUNC(consumeInt8, int8_t) +CONSUME_FUNC(consumeInt16, int16_t) +CONSUME_FUNC(consumeInt32, int32_t) +CONSUME_FUNC(consumeUInt16, uint16_t) +CONSUME_FUNC(consumeUInt32, uint32_t) +CONSUME_FUNC(consumeUInt64, uint64_t) + +/* + * dbgprintf - Uses the logging function provided by the user to log a single + * message, typically without a carriage-return. + * + * @param insn - The instruction containing the logging function. + * @param format - See printf(). + * @param ... - See printf(). + */ +static void dbgprintf(struct InternalInstruction* insn, + const char* format, + ...) { + char buffer[256]; + va_list ap; + + if (!insn->dlog) + return; + + va_start(ap, format); + (void)vsnprintf(buffer, sizeof(buffer), format, ap); + va_end(ap); + + insn->dlog(insn->dlogArg, buffer); + + return; +} + +/* + * setPrefixPresent - Marks that a particular prefix is present at a particular + * location. + * + * @param insn - The instruction to be marked as having the prefix. + * @param prefix - The prefix that is present. + * @param location - The location where the prefix is located (in the address + * space of the instruction's reader). + */ +static void setPrefixPresent(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + insn->prefixPresent[prefix] = 1; + insn->prefixLocations[prefix] = location; +} + +/* + * isPrefixAtLocation - Queries an instruction to determine whether a prefix is + * present at a given location. + * + * @param insn - The instruction to be queried. + * @param prefix - The prefix. + * @param location - The location to query. + * @return - Whether the prefix is at that location. + */ +static bool isPrefixAtLocation(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + return insn->prefixPresent[prefix] == 1 && + insn->prefixLocations[prefix] == location; +} + +/* + * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the + * instruction as having them. Also sets the instruction's default operand, + * address, and other relevant data sizes to report operands correctly. + * + * @param insn - The instruction whose prefixes are to be read. + * @return - 0 if the instruction could be read until the end of the prefix + * bytes, and no prefixes conflicted; nonzero otherwise. 
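+ *
+ * For example, in 32-bit mode a 0x66 prefix flips registerSize and
+ * immediateSize from 4 to 2, and a 0x67 prefix flips addressSize and
+ * displacementSize from 4 to 2; the corresponding defaults are set at the
+ * end of this function.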
+ */ +static int readPrefixes(struct InternalInstruction* insn) { + bool isPrefix = true; + bool prefixGroups[4] = { false }; + uint64_t prefixLocation; + uint8_t byte = 0; + uint8_t nextByte; + + bool hasAdSize = false; + bool hasOpSize = false; + + dbgprintf(insn, "readPrefixes()"); + + while (isPrefix) { + prefixLocation = insn->readerCursor; + + /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */ + if (consumeByte(insn, &byte)) + break; + + /* + * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then + * break and let it be disassembled as a normal "instruction". + */ + if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) + break; + + if (insn->readerCursor - 1 == insn->startLocation + && (byte == 0xf2 || byte == 0xf3) + && !lookAtByte(insn, &nextByte)) + { + /* + * If the byte is 0xf2 or 0xf3, and any of the following conditions are + * met: + * - it is followed by a LOCK (0xf0) prefix + * - it is followed by an xchg instruction + * then it should be disassembled as a xacquire/xrelease not repne/rep. + */ + if ((byte == 0xf2 || byte == 0xf3) && + ((nextByte == 0xf0) || + ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) + insn->xAcquireRelease = true; + /* + * Also if the byte is 0xf3, and the following condition is met: + * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or + * "mov mem, imm" (opcode 0xc6/0xc7) instructions. + * then it should be disassembled as an xrelease not rep. + */ + if (byte == 0xf3 && + (nextByte == 0x88 || nextByte == 0x89 || + nextByte == 0xc6 || nextByte == 0xc7)) + insn->xAcquireRelease = true; + if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) { + if (consumeByte(insn, &nextByte)) + return -1; + if (lookAtByte(insn, &nextByte)) + return -1; + unconsumeByte(insn); + } + if (nextByte != 0x0f && nextByte != 0x90) + break; + } + + switch (byte) { + case 0xf0: /* LOCK */ + case 0xf2: /* REPNE/REPNZ */ + case 0xf3: /* REP or REPE/REPZ */ + if (prefixGroups[0]) + dbgprintf(insn, "Redundant Group 1 prefix"); + prefixGroups[0] = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x2e: /* CS segment override -OR- Branch not taken */ + case 0x36: /* SS segment override -OR- Branch taken */ + case 0x3e: /* DS segment override */ + case 0x26: /* ES segment override */ + case 0x64: /* FS segment override */ + case 0x65: /* GS segment override */ + switch (byte) { + case 0x2e: + insn->segmentOverride = SEG_OVERRIDE_CS; + break; + case 0x36: + insn->segmentOverride = SEG_OVERRIDE_SS; + break; + case 0x3e: + insn->segmentOverride = SEG_OVERRIDE_DS; + break; + case 0x26: + insn->segmentOverride = SEG_OVERRIDE_ES; + break; + case 0x64: + insn->segmentOverride = SEG_OVERRIDE_FS; + break; + case 0x65: + insn->segmentOverride = SEG_OVERRIDE_GS; + break; + default: + debug("Unhandled override"); + return -1; + } + if (prefixGroups[1]) + dbgprintf(insn, "Redundant Group 2 prefix"); + prefixGroups[1] = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x66: /* Operand-size override */ + if (prefixGroups[2]) + dbgprintf(insn, "Redundant Group 3 prefix"); + prefixGroups[2] = true; + hasOpSize = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x67: /* Address-size override */ + if (prefixGroups[3]) + dbgprintf(insn, "Redundant Group 4 prefix"); + prefixGroups[3] = true; + hasAdSize = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + default: /* Not a prefix byte */ + isPrefix = false; + break; + } + + if 
(isPrefix) + dbgprintf(insn, "Found prefix 0x%hhx", byte); + } + + insn->vectorExtensionType = TYPE_NO_VEX_XOP; + + if (byte == 0x62) { + uint8_t byte1, byte2; + + if (consumeByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of EVEX prefix"); + return -1; + } + + if (lookAtByte(insn, &byte2)) { + dbgprintf(insn, "Couldn't read third byte of EVEX prefix"); + return -1; + } + + if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && + ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { + insn->vectorExtensionType = TYPE_EVEX; + } else { + unconsumeByte(insn); /* unconsume byte1 */ + unconsumeByte(insn); /* unconsume byte */ + insn->necessaryPrefixLocation = insn->readerCursor - 2; + } + + if (insn->vectorExtensionType == TYPE_EVEX) { + insn->vectorExtensionPrefix[0] = byte; + insn->vectorExtensionPrefix[1] = byte1; + if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) { + dbgprintf(insn, "Couldn't read third byte of EVEX prefix"); + return -1; + } + if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) { + dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix"); + return -1; + } + + /* We simulate the REX prefix for simplicity's sake */ + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3) + | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2) + | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1) + | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0); + } + + dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]); + } + } else if (byte == 0xc4) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vectorExtensionType = TYPE_VEX_3B; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vectorExtensionType == TYPE_VEX_3B) { + insn->vectorExtensionPrefix[0] = byte; + consumeByte(insn, &insn->vectorExtensionPrefix[1]); + consumeByte(insn, &insn->vectorExtensionPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) + | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) + | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) + | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0); + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2]); + } + } else if (byte == 0xc5) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vectorExtensionType = TYPE_VEX_2B; + } else { + unconsumeByte(insn); + } + + if (insn->vectorExtensionType == TYPE_VEX_2B) { + insn->vectorExtensionPrefix[0] = byte; + consumeByte(insn, &insn->vectorExtensionPrefix[1]); + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2); + } + + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { + default: + break; + case VEX_PREFIX_66: + hasOpSize = true; + break; + } + + dbgprintf(insn, "Found VEX prefix 
0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], + insn->vectorExtensionPrefix[1]); + } + } else if (byte == 0x8f) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of XOP"); + return -1; + } + + if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */ + insn->vectorExtensionType = TYPE_XOP; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vectorExtensionType == TYPE_XOP) { + insn->vectorExtensionPrefix[0] = byte; + consumeByte(insn, &insn->vectorExtensionPrefix[1]); + consumeByte(insn, &insn->vectorExtensionPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) + | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) + | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) + | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0); + } + + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { + default: + break; + case VEX_PREFIX_66: + hasOpSize = true; + break; + } + + dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2]); + } + } else { + if (insn->mode == MODE_64BIT) { + if ((byte & 0xf0) == 0x40) { + uint8_t opcodeByte; + + if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { + dbgprintf(insn, "Redundant REX prefix"); + return -1; + } + + insn->rexPrefix = byte; + insn->necessaryPrefixLocation = insn->readerCursor - 2; + + dbgprintf(insn, "Found REX prefix 0x%hhx", byte); + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } + + if (insn->mode == MODE_16BIT) { + insn->registerSize = (hasOpSize ? 4 : 2); + insn->addressSize = (hasAdSize ? 4 : 2); + insn->displacementSize = (hasAdSize ? 4 : 2); + insn->immediateSize = (hasOpSize ? 4 : 2); + } else if (insn->mode == MODE_32BIT) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 2 : 4); + insn->displacementSize = (hasAdSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } else if (insn->mode == MODE_64BIT) { + if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { + insn->registerSize = 8; + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = 4; + insn->immediateSize = 4; + } else if (insn->rexPrefix) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } else { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } + } + + return 0; +} + +/* + * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of + * extended or escape opcodes). + * + * @param insn - The instruction whose opcode is to be read. + * @return - 0 if the opcode could be read successfully; nonzero otherwise. 
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+  /* Determine the length of the primary opcode */
+
+  uint8_t current;
+
+  dbgprintf(insn, "readOpcode()");
+
+  insn->opcodeType = ONEBYTE;
+
+  if (insn->vectorExtensionType == TYPE_EVEX) {
+    switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
+                mmFromEVEX2of4(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+    switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+    insn->opcodeType = TWOBYTE;
+    return consumeByte(insn, &insn->opcode);
+  } else if (insn->vectorExtensionType == TYPE_XOP) {
+    switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case XOP_MAP_SELECT_8:
+      insn->opcodeType = XOP8_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_9:
+      insn->opcodeType = XOP9_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_A:
+      insn->opcodeType = XOPA_MAP;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
+
+  if (consumeByte(insn, &current))
+    return -1;
+
+  if (current == 0x0f) {
+    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+    if (consumeByte(insn, &current))
+      return -1;
+
+    if (current == 0x38) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_38;
+    } else if (current == 0x3a) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_3A;
+    } else {
+      dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+      insn->opcodeType = TWOBYTE;
+    }
+  }
+
+  /*
+   * At this point we have consumed the full opcode.
+   * Anything we consume from here on must be unconsumed.
+   */
+
+  insn->opcode = current;
+
+  return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ *   the ModR/M byte as appropriate for extended and escape opcodes,
+ *   and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ *   instruction.
+ * @param insn - The instruction whose ID is to be determined.
+ * @param attrMask - The attribute mask to search.
+ * @return - 0 if the ModR/M could be read when needed or was not
+ *   needed; nonzero otherwise.
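+ *
+ * For example, the one-byte group opcode 0xf7 needs its ModR/M reg field to
+ * distinguish TEST from NOT, NEG, MUL, IMUL, DIV and IDIV, so modRMRequired()
+ * reports that the ModR/M byte is needed and it is consumed here before
+ * decode() is called.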
+ */ +static int getIDWithAttrMask(uint16_t* instructionID, + struct InternalInstruction* insn, + uint16_t attrMask) { + bool hasModRMExtension; + + InstructionContext instructionClass = contextForAttrs(attrMask); + + hasModRMExtension = modRMRequired(insn->opcodeType, + instructionClass, + insn->opcode); + + if (hasModRMExtension) { + if (readModRM(insn)) + return -1; + + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + insn->modRM); + } else { + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + 0); + } + + return 0; +} + +/* + * is16BitEquivalent - Determines whether two instruction names refer to + * equivalent instructions but one is 16-bit whereas the other is not. + * + * @param orig - The instruction that is not 16-bit + * @param equiv - The instruction that is 16-bit + */ +static bool is16BitEquivalent(const char* orig, const char* equiv) { + off_t i; + + for (i = 0;; i++) { + if (orig[i] == '\0' && equiv[i] == '\0') + return true; + if (orig[i] == '\0' || equiv[i] == '\0') + return false; + if (orig[i] != equiv[i]) { + if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') + continue; + if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') + continue; + if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') + continue; + return false; + } + } +} + +/* + * is64Bit - Determines whether this instruction is a 64-bit instruction. + * + * @param name - The instruction that is not 16-bit + */ +static bool is64Bit(const char* name) { + off_t i; + + for (i = 0;; ++i) { + if (name[i] == '\0') + return false; + if (name[i] == '6' && name[i+1] == '4') + return true; + } +} + +/* + * getID - Determines the ID of an instruction, consuming the ModR/M byte as + * appropriate for extended and escape opcodes. Determines the attributes and + * context for the instruction before doing so. + * + * @param insn - The instruction whose ID is to be determined. + * @return - 0 if the ModR/M could be read when needed or was not needed; + * nonzero otherwise. + */ +static int getID(struct InternalInstruction* insn, const void *miiArg) { + uint16_t attrMask; + uint16_t instructionID; + + dbgprintf(insn, "getID()"); + + attrMask = ATTR_NONE; + + if (insn->mode == MODE_64BIT) + attrMask |= ATTR_64BIT; + + if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) { + attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? 
ATTR_EVEX : ATTR_VEX; + + if (insn->vectorExtensionType == TYPE_EVEX) { + switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (zFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXKZ; + if (bFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXB; + if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXK; + if (lFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXL; + if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXL2; + } else if (insn->vectorExtensionType == TYPE_VEX_3B) { + switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX3of3(insn->vectorExtensionPrefix[2])) + attrMask |= ATTR_VEXL; + } else if (insn->vectorExtensionType == TYPE_VEX_2B) { + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX2of2(insn->vectorExtensionPrefix[1])) + attrMask |= ATTR_VEXL; + } else if (insn->vectorExtensionType == TYPE_XOP) { + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromXOP3of3(insn->vectorExtensionPrefix[2])) + attrMask |= ATTR_VEXL; + } else { + return -1; + } + } else { + if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) + attrMask |= ATTR_OPSIZE; + else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation)) + attrMask |= ATTR_ADSIZE; + else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XS; + else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XD; + } + + if (insn->rexPrefix & 0x08) + attrMask |= ATTR_REXW; + + /* + * JCXZ/JECXZ need special handling for 16-bit mode because the meaning + * of the AdSize prefix is inverted w.r.t. 32-bit mode. + */ + if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE && + insn->opcode == 0xE3) + attrMask ^= ATTR_ADSIZE; + + /* + * In 64-bit mode all f64 superscripted opcodes ignore opcode size prefix + * CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes + */ + + if (insn->mode == MODE_64BIT && + isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) { + switch (insn->opcode) { + case 0xE8: + case 0xE9: + // Take care of psubsb and other mmx instructions. + if (insn->opcodeType == ONEBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + // Take care of lea and three byte ops. 
+      if (insn->opcodeType == TWOBYTE) {
+        attrMask ^= ATTR_OPSIZE;
+        insn->immediateSize = 4;
+        insn->displacementSize = 4;
+      }
+      break;
+    }
+  }
+
+  if (getIDWithAttrMask(&instructionID, insn, attrMask))
+    return -1;
+
+  /* The following clauses compensate for limitations of the tables. */
+
+  if (insn->mode != MODE_64BIT &&
+      insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+    /*
+     * The tables can't distinguish between cases where the W-bit is used to
+     * select register size and cases where it's a required part of the opcode.
+     */
+    if ((insn->vectorExtensionType == TYPE_EVEX &&
+         wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
+        (insn->vectorExtensionType == TYPE_VEX_3B &&
+         wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
+        (insn->vectorExtensionType == TYPE_XOP &&
+         wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {
+
+      uint16_t instructionIDWithREXW;
+      if (getIDWithAttrMask(&instructionIDWithREXW,
+                            insn, attrMask | ATTR_REXW)) {
+        insn->instructionID = instructionID;
+        insn->spec = specifierForUID(instructionID);
+        return 0;
+      }
+
+      const char *SpecName = GetInstrName(instructionIDWithREXW, miiArg);
+      // If the REXW form is not a 64-bit instruction, switch to that opcode.
+      if (!is64Bit(SpecName)) {
+        insn->instructionID = instructionIDWithREXW;
+        insn->spec = specifierForUID(instructionIDWithREXW);
+        return 0;
+      }
+    }
+  }
+
+  /*
+   * Absolute moves need special handling.
+   * -For 16-bit mode, because the meaning of the AdSize and OpSize prefixes is
+   *  inverted w.r.t. 32-bit mode.
+   * -For 32-bit mode, we need to ensure the ADSIZE prefix is observed in
+   *  any position.
+   */
+  if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) {
+    /* Make sure we observed the prefixes in any position. */
+    if (insn->prefixPresent[0x67])
+      attrMask |= ATTR_ADSIZE;
+    if (insn->prefixPresent[0x66])
+      attrMask |= ATTR_OPSIZE;
+
+    /* In 16-bit, invert the attributes. */
+    if (insn->mode == MODE_16BIT)
+      attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE;
+
+    if (getIDWithAttrMask(&instructionID, insn, attrMask))
+      return -1;
+
+    insn->instructionID = instructionID;
+    insn->spec = specifierForUID(instructionID);
+    return 0;
+  }
+
+  if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
+      !(attrMask & ATTR_OPSIZE)) {
+    /*
+     * The instruction tables make no distinction between instructions that
+     * allow OpSize anywhere (i.e., 16-bit operations) and instructions that
+     * need it in a particular spot (i.e., many MMX operations). In general
+     * we're conservative, but in the specific case where OpSize is present
+     * but not in the right place we check whether there's a 16-bit operation.
+ */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithOpsize; + const char *specName, *specWithOpSizeName; + + spec = specifierForUID(instructionID); + + if (getIDWithAttrMask(&instructionIDWithOpsize, + insn, + attrMask | ATTR_OPSIZE)) { + /* + * ModRM required with OpSize but not present; give up and return version + * without OpSize set + */ + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specName = GetInstrName(instructionID, miiArg); + specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg); + + if (is16BitEquivalent(specName, specWithOpSizeName) && + (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) { + insn->instructionID = instructionIDWithOpsize; + insn->spec = specifierForUID(instructionIDWithOpsize); + } else { + insn->instructionID = instructionID; + insn->spec = spec; + } + return 0; + } + + if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 && + insn->rexPrefix & 0x01) { + /* + * NOOP shouldn't decode as NOOP if REX.b is set. Instead + * it should decode as XCHG %r8, %eax. + */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithNewOpcode; + const struct InstructionSpecifier *specWithNewOpcode; + + spec = specifierForUID(instructionID); + + /* Borrow opcode from one of the other XCHGar opcodes */ + insn->opcode = 0x91; + + if (getIDWithAttrMask(&instructionIDWithNewOpcode, + insn, + attrMask)) { + insn->opcode = 0x90; + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode); + + /* Change back */ + insn->opcode = 0x90; + + insn->instructionID = instructionIDWithNewOpcode; + insn->spec = specWithNewOpcode; + + return 0; + } + + insn->instructionID = instructionID; + insn->spec = specifierForUID(insn->instructionID); + + return 0; +} + +/* + * readSIB - Consumes the SIB byte to determine addressing information for an + * instruction. + * + * @param insn - The instruction whose SIB byte is to be read. + * @return - 0 if the SIB byte was successfully read; nonzero otherwise. + */ +static int readSIB(struct InternalInstruction* insn) { + SIBIndex sibIndexBase = SIB_INDEX_NONE; + SIBBase sibBaseBase = SIB_BASE_NONE; + uint8_t index, base; + + dbgprintf(insn, "readSIB()"); + + if (insn->consumedSIB) + return 0; + + insn->consumedSIB = true; + + switch (insn->addressSize) { + case 2: + dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); + return -1; + case 4: + sibIndexBase = SIB_INDEX_EAX; + sibBaseBase = SIB_BASE_EAX; + break; + case 8: + sibIndexBase = SIB_INDEX_RAX; + sibBaseBase = SIB_BASE_RAX; + break; + } + + if (consumeByte(insn, &insn->sib)) + return -1; + + index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); + + // FIXME: The fifth bit (bit index 4) is only to be used for instructions + // that understand VSIB indexing. ORing the bit in here is mildy dangerous + // because performing math on an 'enum SIBIndex' can produce garbage. 
+ // Excluding the "none" value, it should cover 6 spaces of register names: + // - 16 possibilities for 16-bit GPR starting at SIB_INDEX_BX_SI + // - 16 possibilities for 32-bit GPR starting at SIB_INDEX_EAX + // - 16 possibilities for 64-bit GPR starting at SIB_INDEX_RAX + // - 32 possibilities for each of XMM, YMM, ZMM registers + // When sibIndexBase gets assigned SIB_INDEX_RAX as it does in 64-bit mode, + // summing in a fully decoded index between 0 and 31 can end up with a value + // that looks like something in the low half of the XMM range. + // translateRMMemory() tries to reverse the damage, with only partial success, + // as evidenced by known bugs in "test/MC/Disassembler/X86/x86-64.txt" + if (insn->vectorExtensionType == TYPE_EVEX) + index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4; + + if (index == 0x4) { + insn->sibIndex = SIB_INDEX_NONE; + } else { + insn->sibIndex = (SIBIndex)(sibIndexBase + index); + } + + insn->sibScale = 1 << scaleFromSIB(insn->sib); + + base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); + + switch (base) { + case 0x5: + case 0xd: + switch (modFromModRM(insn->modRM)) { + case 0x0: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = SIB_BASE_NONE; + break; + case 0x1: + insn->eaDisplacement = EA_DISP_8; + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + case 0x2: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + case 0x3: + debug("Cannot have Mod = 0b11 and a SIB byte"); + return -1; + } + break; + default: + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + } + + return 0; +} + +/* + * readDisplacement - Consumes the displacement of an instruction. + * + * @param insn - The instruction whose displacement is to be read. + * @return - 0 if the displacement byte was successfully read; nonzero + * otherwise. + */ +static int readDisplacement(struct InternalInstruction* insn) { + int8_t d8; + int16_t d16; + int32_t d32; + + dbgprintf(insn, "readDisplacement()"); + + if (insn->consumedDisplacement) + return 0; + + insn->consumedDisplacement = true; + insn->displacementOffset = insn->readerCursor - insn->startLocation; + + switch (insn->eaDisplacement) { + case EA_DISP_NONE: + insn->consumedDisplacement = false; + break; + case EA_DISP_8: + if (consumeInt8(insn, &d8)) + return -1; + insn->displacement = d8; + break; + case EA_DISP_16: + if (consumeInt16(insn, &d16)) + return -1; + insn->displacement = d16; + break; + case EA_DISP_32: + if (consumeInt32(insn, &d32)) + return -1; + insn->displacement = d32; + break; + } + + insn->consumedDisplacement = true; + return 0; +} + +/* + * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and + * displacement) for an instruction and interprets it. + * + * @param insn - The instruction whose addressing information is to be read. + * @return - 0 if the information was successfully read; nonzero otherwise. + */ +static int readModRM(struct InternalInstruction* insn) { + uint8_t mod, rm, reg; + + dbgprintf(insn, "readModRM()"); + + if (insn->consumedModRM) + return 0; + + if (consumeByte(insn, &insn->modRM)) + return -1; + insn->consumedModRM = true; + + mod = modFromModRM(insn->modRM); + rm = rmFromModRM(insn->modRM); + reg = regFromModRM(insn->modRM); + + /* + * This goes by insn->registerSize to pick the correct register, which messes + * up if we're using (say) XMM or 8-bit register operands. That gets fixed in + * fixupReg(). 
+ */ + switch (insn->registerSize) { + case 2: + insn->regBase = MODRM_REG_AX; + insn->eaRegBase = EA_REG_AX; + break; + case 4: + insn->regBase = MODRM_REG_EAX; + insn->eaRegBase = EA_REG_EAX; + break; + case 8: + insn->regBase = MODRM_REG_RAX; + insn->eaRegBase = EA_REG_RAX; + break; + } + + reg |= rFromREX(insn->rexPrefix) << 3; + rm |= bFromREX(insn->rexPrefix) << 3; + if (insn->vectorExtensionType == TYPE_EVEX) { + reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; + rm |= xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; + } + + insn->reg = (Reg)(insn->regBase + reg); + + switch (insn->addressSize) { + case 2: + insn->eaBaseBase = EA_BASE_BX_SI; + + switch (mod) { + case 0x0: + if (rm == 0x6) { + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + } else { + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_NONE; + } + break; + case 0x1: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_8; + insn->displacementSize = 1; + if (readDisplacement(insn)) + return -1; + break; + case 0x2: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + break; + case 0x3: + insn->eaBase = (EABase)(insn->eaRegBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 4: + case 8: + insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); + + switch (mod) { + case 0x0: + insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ + // In determining whether RIP-relative mode is used (rm=5), + // or whether a SIB byte is present (rm=4), + // the extension bits (REX.b and EVEX.x) are ignored. + switch (rm & 7) { + case 0x4: // SIB byte is present + insn->eaBase = (insn->addressSize == 4 ? + EA_BASE_sib : EA_BASE_sib64); + if (readSIB(insn) || readDisplacement(insn)) + return -1; + break; + case 0x5: // RIP-relative + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_32; + if (readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + break; + } + break; + case 0x1: + insn->displacementSize = 1; + /* FALLTHROUGH */ + case 0x2: + insn->eaDisplacement = (mod == 0x1 ? 
EA_DISP_8 : EA_DISP_32); + switch (rm & 7) { + case 0x4: // SIB byte is present + insn->eaBase = EA_BASE_sib; + if (readSIB(insn) || readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 0x3: + insn->eaDisplacement = EA_DISP_NONE; + insn->eaBase = (EABase)(insn->eaRegBase + rm); + break; + } + break; + } /* switch (insn->addressSize) */ + + return 0; +} + +#define GENERIC_FIXUP_FUNC(name, base, prefix) \ + static uint8_t name(struct InternalInstruction *insn, \ + OperandType type, \ + uint8_t index, \ + uint8_t *valid) { \ + *valid = 1; \ + switch (type) { \ + default: \ + debug("Unhandled register type"); \ + *valid = 0; \ + return 0; \ + case TYPE_Rv: \ + return base + index; \ + case TYPE_R8: \ + if (insn->rexPrefix && \ + index >= 4 && index <= 7) { \ + return prefix##_SPL + (index - 4); \ + } else { \ + return prefix##_AL + index; \ + } \ + case TYPE_R16: \ + return prefix##_AX + index; \ + case TYPE_R32: \ + return prefix##_EAX + index; \ + case TYPE_R64: \ + return prefix##_RAX + index; \ + case TYPE_XMM512: \ + return prefix##_ZMM0 + index; \ + case TYPE_XMM256: \ + return prefix##_YMM0 + index; \ + case TYPE_XMM128: \ + case TYPE_XMM64: \ + case TYPE_XMM32: \ + case TYPE_XMM: \ + return prefix##_XMM0 + index; \ + case TYPE_VK1: \ + case TYPE_VK2: \ + case TYPE_VK4: \ + case TYPE_VK8: \ + case TYPE_VK16: \ + case TYPE_VK32: \ + case TYPE_VK64: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_K0 + index; \ + case TYPE_MM64: \ + return prefix##_MM0 + (index & 0x7); \ + case TYPE_SEGMENTREG: \ + if (index > 5) \ + *valid = 0; \ + return prefix##_ES + index; \ + case TYPE_DEBUGREG: \ + return prefix##_DR0 + index; \ + case TYPE_CONTROLREG: \ + return prefix##_CR0 + index; \ + } \ + } + +/* + * fixup*Value - Consults an operand type to determine the meaning of the + * reg or R/M field. If the operand is an XMM operand, for example, an + * operand would be XMM0 instead of AX, which readModRM() would otherwise + * misinterpret it as. + * + * @param insn - The instruction containing the operand. + * @param type - The operand type. + * @param index - The existing value of the field as reported by readModRM(). + * @param valid - The address of a uint8_t. The target is set to 1 if the + * field is valid for the register class; 0 if not. + * @return - The proper value. + */ +GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG) +GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG) + +/* + * fixupReg - Consults an operand specifier to determine which of the + * fixup*Value functions to use in correcting readModRM()'ss interpretation. + * + * @param insn - See fixup*Value(). + * @param op - The operand specifier. + * @return - 0 if fixup was successful; -1 if the register returned was + * invalid for its class. 
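+ *
+ * For example, with 16-bit registers (insn->regBase == MODRM_REG_AX) and a
+ * TYPE_XMM128 operand whose ModR/M reg field is 3, readModRM() records the
+ * provisional value MODRM_REG_BX and fixupRegValue() remaps it to
+ * MODRM_REG_XMM3.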
+ */ +static int fixupReg(struct InternalInstruction *insn, + const struct OperandSpecifier *op) { + uint8_t valid; + + dbgprintf(insn, "fixupReg()"); + + switch ((OperandEncoding)op->encoding) { + default: + debug("Expected a REG or R/M encoding in fixupReg"); + return -1; + case ENCODING_VVVV: + insn->vvvv = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->vvvv, + &valid); + if (!valid) + return -1; + break; + case ENCODING_REG: + insn->reg = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->reg - insn->regBase, + &valid); + if (!valid) + return -1; + break; + CASE_ENCODING_RM: + if (insn->eaBase >= insn->eaRegBase) { + insn->eaBase = (EABase)fixupRMValue(insn, + (OperandType)op->type, + insn->eaBase - insn->eaRegBase, + &valid); + if (!valid) + return -1; + } + break; + } + + return 0; +} + +/* + * readOpcodeRegister - Reads an operand from the opcode field of an + * instruction and interprets it appropriately given the operand width. + * Handles AddRegFrm instructions. + * + * @param insn - the instruction whose opcode field is to be read. + * @param size - The width (in bytes) of the register being specified. + * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means + * RAX. + * @return - 0 on success; nonzero otherwise. + */ +static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { + dbgprintf(insn, "readOpcodeRegister()"); + + if (size == 0) + size = insn->registerSize; + + switch (size) { + case 1: + insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + if (insn->rexPrefix && + insn->opcodeRegister >= MODRM_REG_AL + 0x4 && + insn->opcodeRegister < MODRM_REG_AL + 0x8) { + insn->opcodeRegister = (Reg)(MODRM_REG_SPL + + (insn->opcodeRegister - MODRM_REG_AL - 4)); + } + + break; + case 2: + insn->opcodeRegister = (Reg)(MODRM_REG_AX + + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + break; + case 4: + insn->opcodeRegister = (Reg)(MODRM_REG_EAX + + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + break; + case 8: + insn->opcodeRegister = (Reg)(MODRM_REG_RAX + + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + break; + } + + return 0; +} + +/* + * readImmediate - Consumes an immediate operand from an instruction, given the + * desired operand size. + * + * @param insn - The instruction whose operand is to be read. + * @param size - The width (in bytes) of the operand. + * @return - 0 if the immediate was successfully consumed; nonzero + * otherwise. 
+ */ +static int readImmediate(struct InternalInstruction* insn, uint8_t size) { + uint8_t imm8; + uint16_t imm16; + uint32_t imm32; + uint64_t imm64; + + dbgprintf(insn, "readImmediate()"); + + if (insn->numImmediatesConsumed == 2) { + debug("Already consumed two immediates"); + return -1; + } + + if (size == 0) + size = insn->immediateSize; + else + insn->immediateSize = size; + insn->immediateOffset = insn->readerCursor - insn->startLocation; + + switch (size) { + case 1: + if (consumeByte(insn, &imm8)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm8; + break; + case 2: + if (consumeUInt16(insn, &imm16)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm16; + break; + case 4: + if (consumeUInt32(insn, &imm32)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm32; + break; + case 8: + if (consumeUInt64(insn, &imm64)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm64; + break; + } + + insn->numImmediatesConsumed++; + + return 0; +} + +/* + * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix. + * + * @param insn - The instruction whose operand is to be read. + * @return - 0 if the vvvv was successfully consumed; nonzero + * otherwise. + */ +static int readVVVV(struct InternalInstruction* insn) { + dbgprintf(insn, "readVVVV()"); + + int vvvv; + if (insn->vectorExtensionType == TYPE_EVEX) + vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 | + vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2])); + else if (insn->vectorExtensionType == TYPE_VEX_3B) + vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]); + else if (insn->vectorExtensionType == TYPE_VEX_2B) + vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]); + else if (insn->vectorExtensionType == TYPE_XOP) + vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]); + else + return -1; + + if (insn->mode != MODE_64BIT) + vvvv &= 0x7; + + insn->vvvv = static_cast<Reg>(vvvv); + return 0; +} + +/* + * readMaskRegister - Reads an mask register from the opcode field of an + * instruction. + * + * @param insn - The instruction whose opcode field is to be read. + * @return - 0 on success; nonzero otherwise. + */ +static int readMaskRegister(struct InternalInstruction* insn) { + dbgprintf(insn, "readMaskRegister()"); + + if (insn->vectorExtensionType != TYPE_EVEX) + return -1; + + insn->writemask = + static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])); + return 0; +} + +/* + * readOperands - Consults the specifier for an instruction and consumes all + * operands for that instruction, interpreting them as it goes. + * + * @param insn - The instruction whose operands are to be read and interpreted. + * @return - 0 if all operands could be read; nonzero otherwise. + */ +static int readOperands(struct InternalInstruction* insn) { + int hasVVVV, needVVVV; + int sawRegImm = 0; + + dbgprintf(insn, "readOperands()"); + + /* If non-zero vvvv specified, need to make sure one of the operands + uses it. */ + hasVVVV = !readVVVV(insn); + needVVVV = hasVVVV && (insn->vvvv != 0); + + for (const auto &Op : x86OperandSets[insn->spec->operands]) { + switch (Op.encoding) { + case ENCODING_NONE: + case ENCODING_SI: + case ENCODING_DI: + break; + case ENCODING_REG: + CASE_ENCODING_RM: + if (readModRM(insn)) + return -1; + if (fixupReg(insn, &Op)) + return -1; + // Apply the AVX512 compressed displacement scaling factor. 
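+      // The ENCODING_RM_CD* values are laid out consecutively after
+      // ENCODING_RM, so (Op.encoding - ENCODING_RM) is log2 of the scaling
+      // factor: e.g. a disp8 of 0x10 under ENCODING_RM_CD8 expands to
+      // 0x10 * 8 = 0x80.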
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) + insn->displacement *= 1 << (Op.encoding - ENCODING_RM); + break; + case ENCODING_CB: + case ENCODING_CW: + case ENCODING_CD: + case ENCODING_CP: + case ENCODING_CO: + case ENCODING_CT: + dbgprintf(insn, "We currently don't hande code-offset encodings"); + return -1; + case ENCODING_IB: + if (sawRegImm) { + /* Saw a register immediate so don't read again and instead split the + previous immediate. FIXME: This is a hack. */ + insn->immediates[insn->numImmediatesConsumed] = + insn->immediates[insn->numImmediatesConsumed - 1] & 0xf; + ++insn->numImmediatesConsumed; + break; + } + if (readImmediate(insn, 1)) + return -1; + if (Op.type == TYPE_XMM128 || + Op.type == TYPE_XMM256) + sawRegImm = 1; + break; + case ENCODING_IW: + if (readImmediate(insn, 2)) + return -1; + break; + case ENCODING_ID: + if (readImmediate(insn, 4)) + return -1; + break; + case ENCODING_IO: + if (readImmediate(insn, 8)) + return -1; + break; + case ENCODING_Iv: + if (readImmediate(insn, insn->immediateSize)) + return -1; + break; + case ENCODING_Ia: + if (readImmediate(insn, insn->addressSize)) + return -1; + break; + case ENCODING_RB: + if (readOpcodeRegister(insn, 1)) + return -1; + break; + case ENCODING_RW: + if (readOpcodeRegister(insn, 2)) + return -1; + break; + case ENCODING_RD: + if (readOpcodeRegister(insn, 4)) + return -1; + break; + case ENCODING_RO: + if (readOpcodeRegister(insn, 8)) + return -1; + break; + case ENCODING_Rv: + if (readOpcodeRegister(insn, 0)) + return -1; + break; + case ENCODING_FP: + break; + case ENCODING_VVVV: + needVVVV = 0; /* Mark that we have found a VVVV operand. */ + if (!hasVVVV) + return -1; + if (fixupReg(insn, &Op)) + return -1; + break; + case ENCODING_WRITEMASK: + if (readMaskRegister(insn)) + return -1; + break; + case ENCODING_DUP: + break; + default: + dbgprintf(insn, "Encountered an operand with an unknown encoding."); + return -1; + } + } + + /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */ + if (needVVVV) return -1; + + return 0; +} + +/* + * decodeInstruction - Reads and interprets a full instruction provided by the + * user. + * + * @param insn - A pointer to the instruction to be populated. Must be + * pre-allocated. + * @param reader - The function to be used to read the instruction's bytes. + * @param readerArg - A generic argument to be passed to the reader to store + * any internal state. + * @param logger - If non-NULL, the function to be used to write log messages + * and warnings. + * @param loggerArg - A generic argument to be passed to the logger to store + * any internal state. + * @param startLoc - The address (in the reader's address space) of the first + * byte in the instruction. + * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to + * decode the instruction in. + * @return - 0 if the instruction's memory could be read; nonzero if + * not. 
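+ *
+ * Example usage (a minimal sketch; bufReader, bytes, MII and
+ * handleInstruction below are illustrative names, and the byte buffer is
+ * assumed to start at address 0):
+ *
+ *   static int bufReader(const void *arg, uint8_t *byte, uint64_t address) {
+ *     *byte = ((const uint8_t *)arg)[address];
+ *     return 0;
+ *   }
+ *   ...
+ *   struct InternalInstruction insn;
+ *   if (!decodeInstruction(&insn, bufReader, bytes, nullptr, nullptr, MII,
+ *                          0, MODE_64BIT))
+ *     handleInstruction(insn);  // insn.length bytes were consumed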
+ */ +int llvm::X86Disassembler::decodeInstruction( + struct InternalInstruction *insn, byteReader_t reader, + const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg, + uint64_t startLoc, DisassemblerMode mode) { + memset(insn, 0, sizeof(struct InternalInstruction)); + + insn->reader = reader; + insn->readerArg = readerArg; + insn->dlog = logger; + insn->dlogArg = loggerArg; + insn->startLocation = startLoc; + insn->readerCursor = startLoc; + insn->mode = mode; + insn->numImmediatesConsumed = 0; + + if (readPrefixes(insn) || + readOpcode(insn) || + getID(insn, miiArg) || + insn->instructionID == 0 || + readOperands(insn)) + return -1; + + insn->operands = x86OperandSets[insn->spec->operands]; + + insn->length = insn->readerCursor - insn->startLocation; + + dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", + startLoc, insn->readerCursor, insn->length); + + if (insn->length > 15) + dbgprintf(insn, "Instruction exceeds 15-byte limit"); + + return 0; +} diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h new file mode 100644 index 0000000..28a628e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -0,0 +1,675 @@ +//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains the public interface of the instruction decoder. +// Documentation for the disassembler can be found in X86Disassembler.h. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H + +#include "X86DisassemblerDecoderCommon.h" +#include "llvm/ADT/ArrayRef.h" + +namespace llvm { +namespace X86Disassembler { + +// Accessor functions for various fields of an Intel instruction +#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6) +#define regFromModRM(modRM) (((modRM) & 0x38) >> 3) +#define rmFromModRM(modRM) ((modRM) & 0x7) +#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6) +#define indexFromSIB(sib) (((sib) & 0x38) >> 3) +#define baseFromSIB(sib) ((sib) & 0x7) +#define wFromREX(rex) (((rex) & 0x8) >> 3) +#define rFromREX(rex) (((rex) & 0x4) >> 2) +#define xFromREX(rex) (((rex) & 0x2) >> 1) +#define bFromREX(rex) ((rex) & 0x1) + +#define rFromEVEX2of4(evex) (((~(evex)) & 0x80) >> 7) +#define xFromEVEX2of4(evex) (((~(evex)) & 0x40) >> 6) +#define bFromEVEX2of4(evex) (((~(evex)) & 0x20) >> 5) +#define r2FromEVEX2of4(evex) (((~(evex)) & 0x10) >> 4) +#define mmFromEVEX2of4(evex) ((evex) & 0x3) +#define wFromEVEX3of4(evex) (((evex) & 0x80) >> 7) +#define vvvvFromEVEX3of4(evex) (((~(evex)) & 0x78) >> 3) +#define ppFromEVEX3of4(evex) ((evex) & 0x3) +#define zFromEVEX4of4(evex) (((evex) & 0x80) >> 7) +#define l2FromEVEX4of4(evex) (((evex) & 0x40) >> 6) +#define lFromEVEX4of4(evex) (((evex) & 0x20) >> 5) +#define bFromEVEX4of4(evex) (((evex) & 0x10) >> 4) +#define v2FromEVEX4of4(evex) (((~evex) & 0x8) >> 3) +#define aaaFromEVEX4of4(evex) ((evex) & 0x7) + +#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7) +#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6) +#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5) +#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f) +#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7) +#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX3of3(vex) ((vex) & 0x3) + +#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7) +#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX2of2(vex) ((vex) & 0x3) + +#define rFromXOP2of3(xop) (((~(xop)) & 0x80) >> 7) +#define xFromXOP2of3(xop) (((~(xop)) & 0x40) >> 6) +#define bFromXOP2of3(xop) (((~(xop)) & 0x20) >> 5) +#define mmmmmFromXOP2of3(xop) ((xop) & 0x1f) +#define wFromXOP3of3(xop) (((xop) & 0x80) >> 7) +#define vvvvFromXOP3of3(vex) (((~(vex)) & 0x78) >> 3) +#define lFromXOP3of3(xop) (((xop) & 0x4) >> 2) +#define ppFromXOP3of3(xop) ((xop) & 0x3) + +// These enums represent Intel registers for use by the decoder. 
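+//
+// The register lists that follow are X-macros: a consumer defines ENTRY(x)
+// to stamp out one element per register and then expands a list. A sketch of
+// the pattern (the real expansions are the EABase/SIBIndex/SIBBase/Reg enums
+// further down in this file):
+//
+//   #define ENTRY(x) MODRM_REG_##x,
+//   enum Reg { ALL_REGS MODRM_REG_max };
+//   #undef ENTRY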
+#define REGS_8BIT \ + ENTRY(AL) \ + ENTRY(CL) \ + ENTRY(DL) \ + ENTRY(BL) \ + ENTRY(AH) \ + ENTRY(CH) \ + ENTRY(DH) \ + ENTRY(BH) \ + ENTRY(R8B) \ + ENTRY(R9B) \ + ENTRY(R10B) \ + ENTRY(R11B) \ + ENTRY(R12B) \ + ENTRY(R13B) \ + ENTRY(R14B) \ + ENTRY(R15B) \ + ENTRY(SPL) \ + ENTRY(BPL) \ + ENTRY(SIL) \ + ENTRY(DIL) + +#define EA_BASES_16BIT \ + ENTRY(BX_SI) \ + ENTRY(BX_DI) \ + ENTRY(BP_SI) \ + ENTRY(BP_DI) \ + ENTRY(SI) \ + ENTRY(DI) \ + ENTRY(BP) \ + ENTRY(BX) \ + ENTRY(R8W) \ + ENTRY(R9W) \ + ENTRY(R10W) \ + ENTRY(R11W) \ + ENTRY(R12W) \ + ENTRY(R13W) \ + ENTRY(R14W) \ + ENTRY(R15W) + +#define REGS_16BIT \ + ENTRY(AX) \ + ENTRY(CX) \ + ENTRY(DX) \ + ENTRY(BX) \ + ENTRY(SP) \ + ENTRY(BP) \ + ENTRY(SI) \ + ENTRY(DI) \ + ENTRY(R8W) \ + ENTRY(R9W) \ + ENTRY(R10W) \ + ENTRY(R11W) \ + ENTRY(R12W) \ + ENTRY(R13W) \ + ENTRY(R14W) \ + ENTRY(R15W) + +#define EA_BASES_32BIT \ + ENTRY(EAX) \ + ENTRY(ECX) \ + ENTRY(EDX) \ + ENTRY(EBX) \ + ENTRY(sib) \ + ENTRY(EBP) \ + ENTRY(ESI) \ + ENTRY(EDI) \ + ENTRY(R8D) \ + ENTRY(R9D) \ + ENTRY(R10D) \ + ENTRY(R11D) \ + ENTRY(R12D) \ + ENTRY(R13D) \ + ENTRY(R14D) \ + ENTRY(R15D) + +#define REGS_32BIT \ + ENTRY(EAX) \ + ENTRY(ECX) \ + ENTRY(EDX) \ + ENTRY(EBX) \ + ENTRY(ESP) \ + ENTRY(EBP) \ + ENTRY(ESI) \ + ENTRY(EDI) \ + ENTRY(R8D) \ + ENTRY(R9D) \ + ENTRY(R10D) \ + ENTRY(R11D) \ + ENTRY(R12D) \ + ENTRY(R13D) \ + ENTRY(R14D) \ + ENTRY(R15D) + +#define EA_BASES_64BIT \ + ENTRY(RAX) \ + ENTRY(RCX) \ + ENTRY(RDX) \ + ENTRY(RBX) \ + ENTRY(sib64) \ + ENTRY(RBP) \ + ENTRY(RSI) \ + ENTRY(RDI) \ + ENTRY(R8) \ + ENTRY(R9) \ + ENTRY(R10) \ + ENTRY(R11) \ + ENTRY(R12) \ + ENTRY(R13) \ + ENTRY(R14) \ + ENTRY(R15) + +#define REGS_64BIT \ + ENTRY(RAX) \ + ENTRY(RCX) \ + ENTRY(RDX) \ + ENTRY(RBX) \ + ENTRY(RSP) \ + ENTRY(RBP) \ + ENTRY(RSI) \ + ENTRY(RDI) \ + ENTRY(R8) \ + ENTRY(R9) \ + ENTRY(R10) \ + ENTRY(R11) \ + ENTRY(R12) \ + ENTRY(R13) \ + ENTRY(R14) \ + ENTRY(R15) + +#define REGS_MMX \ + ENTRY(MM0) \ + ENTRY(MM1) \ + ENTRY(MM2) \ + ENTRY(MM3) \ + ENTRY(MM4) \ + ENTRY(MM5) \ + ENTRY(MM6) \ + ENTRY(MM7) + +#define REGS_XMM \ + ENTRY(XMM0) \ + ENTRY(XMM1) \ + ENTRY(XMM2) \ + ENTRY(XMM3) \ + ENTRY(XMM4) \ + ENTRY(XMM5) \ + ENTRY(XMM6) \ + ENTRY(XMM7) \ + ENTRY(XMM8) \ + ENTRY(XMM9) \ + ENTRY(XMM10) \ + ENTRY(XMM11) \ + ENTRY(XMM12) \ + ENTRY(XMM13) \ + ENTRY(XMM14) \ + ENTRY(XMM15) \ + ENTRY(XMM16) \ + ENTRY(XMM17) \ + ENTRY(XMM18) \ + ENTRY(XMM19) \ + ENTRY(XMM20) \ + ENTRY(XMM21) \ + ENTRY(XMM22) \ + ENTRY(XMM23) \ + ENTRY(XMM24) \ + ENTRY(XMM25) \ + ENTRY(XMM26) \ + ENTRY(XMM27) \ + ENTRY(XMM28) \ + ENTRY(XMM29) \ + ENTRY(XMM30) \ + ENTRY(XMM31) + +#define REGS_YMM \ + ENTRY(YMM0) \ + ENTRY(YMM1) \ + ENTRY(YMM2) \ + ENTRY(YMM3) \ + ENTRY(YMM4) \ + ENTRY(YMM5) \ + ENTRY(YMM6) \ + ENTRY(YMM7) \ + ENTRY(YMM8) \ + ENTRY(YMM9) \ + ENTRY(YMM10) \ + ENTRY(YMM11) \ + ENTRY(YMM12) \ + ENTRY(YMM13) \ + ENTRY(YMM14) \ + ENTRY(YMM15) \ + ENTRY(YMM16) \ + ENTRY(YMM17) \ + ENTRY(YMM18) \ + ENTRY(YMM19) \ + ENTRY(YMM20) \ + ENTRY(YMM21) \ + ENTRY(YMM22) \ + ENTRY(YMM23) \ + ENTRY(YMM24) \ + ENTRY(YMM25) \ + ENTRY(YMM26) \ + ENTRY(YMM27) \ + ENTRY(YMM28) \ + ENTRY(YMM29) \ + ENTRY(YMM30) \ + ENTRY(YMM31) + +#define REGS_ZMM \ + ENTRY(ZMM0) \ + ENTRY(ZMM1) \ + ENTRY(ZMM2) \ + ENTRY(ZMM3) \ + ENTRY(ZMM4) \ + ENTRY(ZMM5) \ + ENTRY(ZMM6) \ + ENTRY(ZMM7) \ + ENTRY(ZMM8) \ + ENTRY(ZMM9) \ + ENTRY(ZMM10) \ + ENTRY(ZMM11) \ + ENTRY(ZMM12) \ + ENTRY(ZMM13) \ + ENTRY(ZMM14) \ + ENTRY(ZMM15) \ + ENTRY(ZMM16) \ + ENTRY(ZMM17) \ + ENTRY(ZMM18) \ + ENTRY(ZMM19) \ + ENTRY(ZMM20) \ + ENTRY(ZMM21) \ + 
ENTRY(ZMM22) \ + ENTRY(ZMM23) \ + ENTRY(ZMM24) \ + ENTRY(ZMM25) \ + ENTRY(ZMM26) \ + ENTRY(ZMM27) \ + ENTRY(ZMM28) \ + ENTRY(ZMM29) \ + ENTRY(ZMM30) \ + ENTRY(ZMM31) + +#define REGS_MASKS \ + ENTRY(K0) \ + ENTRY(K1) \ + ENTRY(K2) \ + ENTRY(K3) \ + ENTRY(K4) \ + ENTRY(K5) \ + ENTRY(K6) \ + ENTRY(K7) + +#define REGS_SEGMENT \ + ENTRY(ES) \ + ENTRY(CS) \ + ENTRY(SS) \ + ENTRY(DS) \ + ENTRY(FS) \ + ENTRY(GS) + +#define REGS_DEBUG \ + ENTRY(DR0) \ + ENTRY(DR1) \ + ENTRY(DR2) \ + ENTRY(DR3) \ + ENTRY(DR4) \ + ENTRY(DR5) \ + ENTRY(DR6) \ + ENTRY(DR7) \ + ENTRY(DR8) \ + ENTRY(DR9) \ + ENTRY(DR10) \ + ENTRY(DR11) \ + ENTRY(DR12) \ + ENTRY(DR13) \ + ENTRY(DR14) \ + ENTRY(DR15) + +#define REGS_CONTROL \ + ENTRY(CR0) \ + ENTRY(CR1) \ + ENTRY(CR2) \ + ENTRY(CR3) \ + ENTRY(CR4) \ + ENTRY(CR5) \ + ENTRY(CR6) \ + ENTRY(CR7) \ + ENTRY(CR8) \ + ENTRY(CR9) \ + ENTRY(CR10) \ + ENTRY(CR11) \ + ENTRY(CR12) \ + ENTRY(CR13) \ + ENTRY(CR14) \ + ENTRY(CR15) + +#define ALL_EA_BASES \ + EA_BASES_16BIT \ + EA_BASES_32BIT \ + EA_BASES_64BIT + +#define ALL_SIB_BASES \ + REGS_32BIT \ + REGS_64BIT + +#define ALL_REGS \ + REGS_8BIT \ + REGS_16BIT \ + REGS_32BIT \ + REGS_64BIT \ + REGS_MMX \ + REGS_XMM \ + REGS_YMM \ + REGS_ZMM \ + REGS_MASKS \ + REGS_SEGMENT \ + REGS_DEBUG \ + REGS_CONTROL \ + ENTRY(RIP) + +/// \brief All possible values of the base field for effective-address +/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte. +/// We distinguish between bases (EA_BASE_*) and registers that just happen +/// to be referred to when Mod == 0b11 (EA_REG_*). +enum EABase { + EA_BASE_NONE, +#define ENTRY(x) EA_BASE_##x, + ALL_EA_BASES +#undef ENTRY +#define ENTRY(x) EA_REG_##x, + ALL_REGS +#undef ENTRY + EA_max +}; + +/// \brief All possible values of the SIB index field. +/// borrows entries from ALL_EA_BASES with the special case that +/// sib is synonymous with NONE. +/// Vector SIB: index can be XMM or YMM. +enum SIBIndex { + SIB_INDEX_NONE, +#define ENTRY(x) SIB_INDEX_##x, + ALL_EA_BASES + REGS_XMM + REGS_YMM + REGS_ZMM +#undef ENTRY + SIB_INDEX_max +}; + +/// \brief All possible values of the SIB base field. +enum SIBBase { + SIB_BASE_NONE, +#define ENTRY(x) SIB_BASE_##x, + ALL_SIB_BASES +#undef ENTRY + SIB_BASE_max +}; + +/// \brief Possible displacement types for effective-address computations. +typedef enum { + EA_DISP_NONE, + EA_DISP_8, + EA_DISP_16, + EA_DISP_32 +} EADisplacement; + +/// \brief All possible values of the reg field in the ModR/M byte. +enum Reg { +#define ENTRY(x) MODRM_REG_##x, + ALL_REGS +#undef ENTRY + MODRM_REG_max +}; + +/// \brief All possible segment overrides. +enum SegmentOverride { + SEG_OVERRIDE_NONE, + SEG_OVERRIDE_CS, + SEG_OVERRIDE_SS, + SEG_OVERRIDE_DS, + SEG_OVERRIDE_ES, + SEG_OVERRIDE_FS, + SEG_OVERRIDE_GS, + SEG_OVERRIDE_max +}; + +/// \brief Possible values for the VEX.m-mmmm field +enum VEXLeadingOpcodeByte { + VEX_LOB_0F = 0x1, + VEX_LOB_0F38 = 0x2, + VEX_LOB_0F3A = 0x3 +}; + +enum XOPMapSelect { + XOP_MAP_SELECT_8 = 0x8, + XOP_MAP_SELECT_9 = 0x9, + XOP_MAP_SELECT_A = 0xA +}; + +/// \brief Possible values for the VEX.pp/EVEX.pp field +enum VEXPrefixCode { + VEX_PREFIX_NONE = 0x0, + VEX_PREFIX_66 = 0x1, + VEX_PREFIX_F3 = 0x2, + VEX_PREFIX_F2 = 0x3 +}; + +enum VectorExtensionType { + TYPE_NO_VEX_XOP = 0x0, + TYPE_VEX_2B = 0x1, + TYPE_VEX_3B = 0x2, + TYPE_EVEX = 0x3, + TYPE_XOP = 0x4 +}; + +/// \brief Type for the byte reader that the consumer must provide to +/// the decoder. Reads a single byte from the instruction's address space. 
+/// \param arg A baton that the consumer can associate with any internal +/// state that it needs. +/// \param byte A pointer to a single byte in memory that should be set to +/// contain the value at address. +/// \param address The address in the instruction's address space that should +/// be read from. +/// \return -1 if the byte cannot be read for any reason; 0 otherwise. +typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address); + +/// \brief Type for the logging function that the consumer can provide to +/// get debugging output from the decoder. +/// \param arg A baton that the consumer can associate with any internal +/// state that it needs. +/// \param log A string that contains the message. Will be reused after +/// the logger returns. +typedef void (*dlog_t)(void *arg, const char *log); + +/// The specification for how to extract and interpret a full instruction and +/// its operands. +struct InstructionSpecifier { + uint16_t operands; +}; + +/// The x86 internal instruction, which is produced by the decoder. +struct InternalInstruction { + // Reader interface (C) + byteReader_t reader; + // Opaque value passed to the reader + const void* readerArg; + // The address of the next byte to read via the reader + uint64_t readerCursor; + + // Logger interface (C) + dlog_t dlog; + // Opaque value passed to the logger + void* dlogArg; + + // General instruction information + + // The mode to disassemble for (64-bit, protected, real) + DisassemblerMode mode; + // The start of the instruction, usable with the reader + uint64_t startLocation; + // The length of the instruction, in bytes + size_t length; + + // Prefix state + + // 1 if the prefix byte corresponding to the entry is present; 0 if not + uint8_t prefixPresent[0x100]; + // contains the location (for use with the reader) of the prefix byte + uint64_t prefixLocations[0x100]; + // The value of the vector extension prefix(EVEX/VEX/XOP), if present + uint8_t vectorExtensionPrefix[4]; + // The type of the vector extension prefix + VectorExtensionType vectorExtensionType; + // The value of the REX prefix, if present + uint8_t rexPrefix; + // The location where a mandatory prefix would have to be (i.e., right before + // the opcode, or right before the REX prefix if one is present). + uint64_t necessaryPrefixLocation; + // The segment override type + SegmentOverride segmentOverride; + // 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease + bool xAcquireRelease; + + // Sizes of various critical pieces of data, in bytes + uint8_t registerSize; + uint8_t addressSize; + uint8_t displacementSize; + uint8_t immediateSize; + + // Offsets from the start of the instruction to the pieces of data, which is + // needed to find relocation entries for adding symbolic operands. + uint8_t displacementOffset; + uint8_t immediateOffset; + + // opcode state + + // The last byte of the opcode, not counting any ModR/M extension + uint8_t opcode; + + // decode state + + // The type of opcode, used for indexing into the array of decode tables + OpcodeType opcodeType; + // The instruction ID, extracted from the decode table + uint16_t instructionID; + // The specifier for the instruction, from the instruction info table + const InstructionSpecifier *spec; + + // state for additional bytes, consumed during operand decode. Pattern: + // consumed___ indicates that the byte was already consumed and does not + // need to be consumed again. 
+ + // The VEX.vvvv field, which contains a third register operand for some AVX + // instructions. + Reg vvvv; + + // The writemask for AVX-512 instructions which is contained in EVEX.aaa + Reg writemask; + + // The ModR/M byte, which contains most register operands and some portion of + // all memory operands. + bool consumedModRM; + uint8_t modRM; + + // The SIB byte, used for more complex 32- or 64-bit memory operands + bool consumedSIB; + uint8_t sib; + + // The displacement, used for memory operands + bool consumedDisplacement; + int32_t displacement; + + // Immediates. There can be two in some cases + uint8_t numImmediatesConsumed; + uint8_t numImmediatesTranslated; + uint64_t immediates[2]; + + // A register or immediate operand encoded into the opcode + Reg opcodeRegister; + + // Portions of the ModR/M byte + + // These fields determine the allowable values for the ModR/M fields, which + // depend on operand and address widths. + EABase eaBaseBase; + EABase eaRegBase; + Reg regBase; + + // The Mod and R/M fields can encode a base for an effective address, or a + // register. These are separated into two fields here. + EABase eaBase; + EADisplacement eaDisplacement; + // The reg field always encodes a register + Reg reg; + + // SIB state + SIBIndex sibIndex; + uint8_t sibScale; + SIBBase sibBase; + + ArrayRef<OperandSpecifier> operands; +}; + +/// \brief Decode one instruction and store the decoding results in +/// a buffer provided by the consumer. +/// \param insn The buffer to store the instruction in. Allocated by the +/// consumer. +/// \param reader The byteReader_t for the bytes to be read. +/// \param readerArg An argument to pass to the reader for storing context +/// specific to the consumer. May be NULL. +/// \param logger The dlog_t to be used in printing status messages from the +/// disassembler. May be NULL. +/// \param loggerArg An argument to pass to the logger for storing context +/// specific to the logger. May be NULL. +/// \param startLoc The address (in the reader's address space) of the first +/// byte in the instruction. +/// \param mode The mode (16-bit, 32-bit, 64-bit) to decode in. +/// \return Nonzero if there was an error during decode, 0 otherwise. +int decodeInstruction(InternalInstruction *insn, + byteReader_t reader, + const void *readerArg, + dlog_t logger, + void *loggerArg, + const void *miiArg, + uint64_t startLoc, + DisassemblerMode mode); + +/// \brief Print a message to debugs() +/// \param file The name of the file printing the debug message. +/// \param line The line number that printed the debug message. +/// \param s The message to print. +void Debug(const char *file, unsigned line, const char *s); + +const char *GetInstrName(unsigned Opcode, const void *mii); + +} // namespace X86Disassembler +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h new file mode 100644 index 0000000..301db72 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -0,0 +1,503 @@ +//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. 
+// It contains common definitions used by both the disassembler and the table +// generator. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H + +#include "llvm/Support/DataTypes.h" + +namespace llvm { +namespace X86Disassembler { + +#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers +#define CONTEXTS_SYM x86DisassemblerContexts +#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes +#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes +#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes +#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes +#define XOP8_MAP_SYM x86DisassemblerXOP8Opcodes +#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes +#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes + +#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers" +#define CONTEXTS_STR "x86DisassemblerContexts" +#define ONEBYTE_STR "x86DisassemblerOneByteOpcodes" +#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes" +#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes" +#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes" +#define XOP8_MAP_STR "x86DisassemblerXOP8Opcodes" +#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes" +#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes" + +// Attributes of an instruction that must be known before the opcode can be +// processed correctly. Most of these indicate the presence of particular +// prefixes, but ATTR_64BIT is simply an attribute of the decoding context. +#define ATTRIBUTE_BITS \ + ENUM_ENTRY(ATTR_NONE, 0x00) \ + ENUM_ENTRY(ATTR_64BIT, (0x1 << 0)) \ + ENUM_ENTRY(ATTR_XS, (0x1 << 1)) \ + ENUM_ENTRY(ATTR_XD, (0x1 << 2)) \ + ENUM_ENTRY(ATTR_REXW, (0x1 << 3)) \ + ENUM_ENTRY(ATTR_OPSIZE, (0x1 << 4)) \ + ENUM_ENTRY(ATTR_ADSIZE, (0x1 << 5)) \ + ENUM_ENTRY(ATTR_VEX, (0x1 << 6)) \ + ENUM_ENTRY(ATTR_VEXL, (0x1 << 7)) \ + ENUM_ENTRY(ATTR_EVEX, (0x1 << 8)) \ + ENUM_ENTRY(ATTR_EVEXL, (0x1 << 9)) \ + ENUM_ENTRY(ATTR_EVEXL2, (0x1 << 10)) \ + ENUM_ENTRY(ATTR_EVEXK, (0x1 << 11)) \ + ENUM_ENTRY(ATTR_EVEXKZ, (0x1 << 12)) \ + ENUM_ENTRY(ATTR_EVEXB, (0x1 << 13)) + +#define ENUM_ENTRY(n, v) n = v, +enum attributeBits { + ATTRIBUTE_BITS + ATTR_max +}; +#undef ENUM_ENTRY + +// Combinations of the above attributes that are relevant to instruction +// decode. Although other combinations are possible, they can be reduced to +// these without affecting the ultimately decoded instruction. 
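+//
+// Roughly speaking, the decoder ORs together the ATTR_* bits it has observed
+// and uses that mask to index the generated CONTEXTS_SYM table, which yields
+// one of the IC_* values below. A sketch (not the actual lookup code):
+//
+//   uint16_t attrMask = ATTR_EVEX | ATTR_REXW | ATTR_OPSIZE;
+//   InstructionContext IC = (InstructionContext)CONTEXTS_SYM[attrMask];
+//   // expected to yield IC_EVEX_W_OPSIZE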
+ +// Class name Rank Rationale for rank assignment +#define INSTRUCTION_CONTEXTS \ + ENUM_ENTRY(IC, 0, "says nothing about the instruction") \ + ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \ + "64-bit mode but no more") \ + ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_OPSIZE_ADSIZE, 4, "requires ADSIZE and OPSIZE prefixes") \ + ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \ + "but not the operands") \ + ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \ + "but not the operands") \ + ENUM_ENTRY(IC_XD_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_64BIT_REXW, 5, "requires a REX.W prefix, so operands "\ + "change width; overrides IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_REXW_ADSIZE, 6, "requires a REX.W prefix and 0x67 " \ + "prefix") \ + ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \ + ENUM_ENTRY(IC_64BIT_OPSIZE_ADSIZE, 4, "Just as meaningful as IC_OPSIZE/" \ + "IC_ADSIZE") \ + ENUM_ENTRY(IC_64BIT_XD, 6, "XD instructions are SSE; REX.W is " \ + "secondary") \ + ENUM_ENTRY(IC_64BIT_XS, 6, "Just as meaningful as IC_64BIT_XD") \ + ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_REXW_XS, 7, "OPSIZE could mean a different " \ + "opcode") \ + ENUM_ENTRY(IC_64BIT_REXW_XD, 7, "Just as meaningful as " \ + "IC_64BIT_REXW_XS") \ + ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 8, "The Dynamic Duo! 
Prefer over all " \ + "else because this changes most " \ + "operands' meaning") \ + ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \ + ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \ + ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \ + ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \ + ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \ + ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \ + ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \ + ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \ + ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \ + ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\ + ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\ + ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \ + ENUM_ENTRY(IC_VEX_L_W, 4, "requires VEX, L and W") \ + ENUM_ENTRY(IC_VEX_L_W_XS, 5, "requires VEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_VEX_L_W_XD, 5, "requires VEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \ + ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 
4, "requires EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \ + ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires EVEX_B, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\ + 
ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_KZ_B, 1, "requires EVEX_B and EVEX_KZ prefix") \ + ENUM_ENTRY(IC_EVEX_XS_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_KZ, 1, "requires an EVEX_KZ prefix") \ + ENUM_ENTRY(IC_EVEX_XS_KZ, 2, "requires EVEX_KZ and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_KZ, 2, "requires EVEX_KZ and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_KZ, 2, "requires EVEX_KZ and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_KZ, 3, "requires EVEX_KZ and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_KZ, 4, "requires EVEX_KZ, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_KZ, 4, 
"requires EVEX_KZ, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ, 4, "requires EVEX_KZ, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_KZ, 3, "requires EVEX_KZ and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_KZ, 4, "requires EVEX_KZ and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_KZ, 4, "requires EVEX_KZ and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ, 4, "requires EVEX_KZ, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_KZ, 3, "requires EVEX_KZ, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_KZ, 4, "requires EVEX_KZ, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_KZ, 4, "requires EVEX_KZ, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_KZ, 3, "requires EVEX_KZ and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_KZ, 4, "requires EVEX_KZ and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_KZ, 4, "requires EVEX_KZ and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") + +#define ENUM_ENTRY(n, r, d) n, +enum InstructionContext { + INSTRUCTION_CONTEXTS + IC_max +}; +#undef ENUM_ENTRY + +// Opcode types, which determine which decode table to use, both in the Intel +// manual and also for the decoder. +enum OpcodeType { + ONEBYTE = 0, + TWOBYTE = 1, + THREEBYTE_38 = 2, + THREEBYTE_3A = 3, + XOP8_MAP = 4, + XOP9_MAP = 5, + XOPA_MAP = 6 +}; + +// The following structs are used for the hierarchical decode table. After +// determining the instruction's class (i.e., which IC_* constant applies to +// it), the decoder reads the opcode. Some instructions require specific +// values of the ModR/M byte, so the ModR/M byte indexes into the final table. +// +// If a ModR/M byte is not required, "required" is left unset, and the values +// for each instructionID are identical. +typedef uint16_t InstrUID; + +// ModRMDecisionType - describes the type of ModR/M decision, allowing the +// consumer to determine the number of entries in it. +// +// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded +// instruction is the same. +// MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode +// corresponds to one instruction; otherwise, it corresponds to +// a different instruction. +// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte +// divided by 8 is used to select instruction; otherwise, each +// value of the ModR/M byte could correspond to a different +// instruction. +// MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This +// corresponds to instructions that use reg field as opcode +// MODRM_FULL - Potentially, each value of the ModR/M byte could correspond +// to a different instruction. 
+#define MODRMTYPES \ + ENUM_ENTRY(MODRM_ONEENTRY) \ + ENUM_ENTRY(MODRM_SPLITRM) \ + ENUM_ENTRY(MODRM_SPLITMISC) \ + ENUM_ENTRY(MODRM_SPLITREG) \ + ENUM_ENTRY(MODRM_FULL) + +#define ENUM_ENTRY(n) n, +enum ModRMDecisionType { + MODRMTYPES + MODRM_max +}; +#undef ENUM_ENTRY + +#define CASE_ENCODING_RM \ + case ENCODING_RM: \ + case ENCODING_RM_CD2: \ + case ENCODING_RM_CD4: \ + case ENCODING_RM_CD8: \ + case ENCODING_RM_CD16: \ + case ENCODING_RM_CD32: \ + case ENCODING_RM_CD64 + +// Physical encodings of instruction operands. +#define ENCODINGS \ + ENUM_ENTRY(ENCODING_NONE, "") \ + ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2") \ + ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4") \ + ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8") \ + ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \ + ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \ + ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \ + ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ + ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \ + ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ + ENUM_ENTRY(ENCODING_CW, "2-byte") \ + ENUM_ENTRY(ENCODING_CD, "4-byte") \ + ENUM_ENTRY(ENCODING_CP, "6-byte") \ + ENUM_ENTRY(ENCODING_CO, "8-byte") \ + ENUM_ENTRY(ENCODING_CT, "10-byte") \ + ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \ + ENUM_ENTRY(ENCODING_IW, "2-byte") \ + ENUM_ENTRY(ENCODING_ID, "4-byte") \ + ENUM_ENTRY(ENCODING_IO, "8-byte") \ + ENUM_ENTRY(ENCODING_RB, "(AL..DIL, R8L..R15L) Register code added to " \ + "the opcode byte") \ + ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \ + ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \ + ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \ + ENUM_ENTRY(ENCODING_FP, "Position on floating-point stack in ModR/M " \ + "byte.") \ + \ + ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \ + ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \ + ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \ + "opcode byte") \ + ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \ + "in type") \ + ENUM_ENTRY(ENCODING_SI, "Source index; encoded in OpSize/Adsize prefix") \ + ENUM_ENTRY(ENCODING_DI, "Destination index; encoded in prefixes") + +#define ENUM_ENTRY(n, d) n, +enum OperandEncoding { + ENCODINGS + ENCODING_max +}; +#undef ENUM_ENTRY + +// Semantic interpretations of instruction operands. 
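+//
+// Each operand in the generated tables pairs one of the ENCODING_* values
+// above with one of the TYPE_* values below (see struct OperandSpecifier
+// later in this file); for instance, a plain byte immediate would typically
+// pair ENCODING_IB with TYPE_IMM8.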
+#define TYPES \ + ENUM_ENTRY(TYPE_NONE, "") \ + ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \ + ENUM_ENTRY(TYPE_REL16, "2-byte") \ + ENUM_ENTRY(TYPE_REL32, "4-byte") \ + ENUM_ENTRY(TYPE_REL64, "8-byte") \ + ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \ + ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \ + ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \ + ENUM_ENTRY(TYPE_R8, "1-byte register operand") \ + ENUM_ENTRY(TYPE_R16, "2-byte") \ + ENUM_ENTRY(TYPE_R32, "4-byte") \ + ENUM_ENTRY(TYPE_R64, "8-byte") \ + ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \ + ENUM_ENTRY(TYPE_IMM16, "2-byte") \ + ENUM_ENTRY(TYPE_IMM32, "4-byte") \ + ENUM_ENTRY(TYPE_IMM64, "8-byte") \ + ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \ + ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \ + ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \ + ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \ + ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \ + ENUM_ENTRY(TYPE_RM16, "2-byte") \ + ENUM_ENTRY(TYPE_RM32, "4-byte") \ + ENUM_ENTRY(TYPE_RM64, "8-byte") \ + ENUM_ENTRY(TYPE_M, "Memory operand") \ + ENUM_ENTRY(TYPE_M8, "1-byte") \ + ENUM_ENTRY(TYPE_M16, "2-byte") \ + ENUM_ENTRY(TYPE_M32, "4-byte") \ + ENUM_ENTRY(TYPE_M64, "8-byte") \ + ENUM_ENTRY(TYPE_LEA, "Effective address") \ + ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \ + ENUM_ENTRY(TYPE_M256, "256-byte (AVX)") \ + ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \ + ENUM_ENTRY(TYPE_M1632, "2+4-byte") \ + ENUM_ENTRY(TYPE_M1664, "2+8-byte") \ + ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \ + ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \ + ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \ + ENUM_ENTRY(TYPE_SRCIDX64, "8-byte memory at source index") \ + ENUM_ENTRY(TYPE_DSTIDX8, "1-byte memory at destination index") \ + ENUM_ENTRY(TYPE_DSTIDX16, "2-byte memory at destination index") \ + ENUM_ENTRY(TYPE_DSTIDX32, "4-byte memory at destination index") \ + ENUM_ENTRY(TYPE_DSTIDX64, "8-byte memory at destination index") \ + ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \ + "base)") \ + ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \ + ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \ + ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \ + ENUM_ENTRY(TYPE_SREG, "Byte with single bit set: 0 = ES, 1 = CS, " \ + "2 = SS, 3 = DS, 4 = FS, 5 = GS") \ + ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \ + ENUM_ENTRY(TYPE_M64FP, "64-bit") \ + ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \ + ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \ + ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \ + ENUM_ENTRY(TYPE_XMM, "XMM register operand") \ + ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \ + ENUM_ENTRY(TYPE_XMM64, "8-byte") \ + ENUM_ENTRY(TYPE_XMM128, "16-byte") \ + ENUM_ENTRY(TYPE_XMM256, "32-byte") \ + ENUM_ENTRY(TYPE_XMM512, "64-byte") \ + ENUM_ENTRY(TYPE_VK1, "1-bit") \ + ENUM_ENTRY(TYPE_VK2, "2-bit") \ + ENUM_ENTRY(TYPE_VK4, "4-bit") \ + ENUM_ENTRY(TYPE_VK8, "8-bit") \ + ENUM_ENTRY(TYPE_VK16, "16-bit") \ + ENUM_ENTRY(TYPE_VK32, "32-bit") \ + ENUM_ENTRY(TYPE_VK64, "64-bit") \ + ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ + ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ + ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ + ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \ + ENUM_ENTRY(TYPE_BNDR, "MPX bounds register") \ + \ + ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \ + 
ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \ + ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \ + ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \ + ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \ + ENUM_ENTRY(TYPE_DUP1, "operand 1") \ + ENUM_ENTRY(TYPE_DUP2, "operand 2") \ + ENUM_ENTRY(TYPE_DUP3, "operand 3") \ + ENUM_ENTRY(TYPE_DUP4, "operand 4") \ + ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state") + +#define ENUM_ENTRY(n, d) n, +enum OperandType { + TYPES + TYPE_max +}; +#undef ENUM_ENTRY + +/// \brief The specification for how to extract and interpret one operand. +struct OperandSpecifier { + uint8_t encoding; + uint8_t type; +}; + +static const unsigned X86_MAX_OPERANDS = 6; + +/// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode +/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode, +/// respectively. +enum DisassemblerMode { + MODE_16BIT, + MODE_32BIT, + MODE_64BIT +}; + +} // namespace X86Disassembler +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp new file mode 100644 index 0000000..b4c0bc4 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -0,0 +1,289 @@ +//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file includes code for rendering MCInst instances as AT&T-style +// assembly. +// +//===----------------------------------------------------------------------===// + +#include "X86ATTInstPrinter.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86InstComments.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/FormattedStream.h" +#include <map> +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// Include the auto-generated portion of the assembly writer. +#define PRINT_ALIAS_INSTR +#include "X86GenAsmWriter.inc" + +void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">"); +} + +void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) + HasCustomInstComment = + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); + + if (TSFlags & X86II::LOCK) + OS << "\tlock\t"; + + // Output CALLpcrel32 as "callq" in 64-bit mode. + // In Intel annotation it's always emitted as "call". + // + // TODO: Probably this hack should be redesigned via InstAlias in + // InstrInfo.td as soon as Requires clause is supported properly + // for InstAlias. 
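+ // For illustration: "call foo" parsed in 64-bit mode still produces
+ // CALLpcrel32, so the block below prints it with the "q" suffix that
+ // 64-bit AT&T output conventionally uses (example mnemonics, not verbatim
+ // parser output).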
+ if (MI->getOpcode() == X86::CALLpcrel32 && + (STI.getFeatureBits()[X86::Mode64Bit])) { + OS << "\tcallq\t"; + printPCRelImm(MI, 0, OS); + } + // Try to print any aliases first. + else if (!printAliasInstr(MI, OS)) + printInstruction(MI, OS); + + // Next always print the annotation. + printAnnotation(OS, Annot); +} + +void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid ssecc/avxcc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; + } +} + +void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid xopcc argument!"); + case 0: O << "lt"; break; + case 1: O << "le"; break; + case 2: O << "gt"; break; + case 3: O << "ge"; break; + case 4: O << "eq"; break; + case 5: O << "neq"; break; + case 6: O << "false"; break; + case 7: O << "true"; break; + } +} + +void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0x3; + switch (Imm) { + case 0: O << "{rn-sae}"; break; + case 1: O << "{rd-sae}"; break; + case 2: O << "{ru-sae}"; break; + case 3: O << "{rz-sae}"; break; + } +} +/// printPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value (e.g. for jumps and calls). These +/// print slightly differently than normal immediates. For example, a $ is not +/// emitted. +void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + O << formatImm(Op.getImm()); + else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { + O << formatHex((uint64_t)Address); + } else { + // Otherwise, just print the expression. + Op.getExpr()->print(O, &MAI); + } + } +} + +void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + } else if (Op.isImm()) { + // Print X86 immediates as signed values. 
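+ // For example (illustrative): an operand value of 0xffffffffffffffff is
+ // printed as $-1, and a value such as 4096 is printed as $4096 with an
+ // "imm = 0x1000" comment appended below when no custom comment exists.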
+ O << markup("<imm:") << '$' << formatImm((int64_t)Op.getImm()) + << markup(">"); + + // If there are no instruction-specific comments, add a comment clarifying + // the hex value of the immediate operand when it isn't in the range + // [-256,255]. + if (CommentStream && !HasCustomInstComment && + (Op.getImm() > 255 || Op.getImm() < -256)) + *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); + + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << markup("<imm:") << '$'; + Op.getExpr()->print(O, &MAI); + O << markup(">"); + } +} + +void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg); + const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg); + const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp); + const MCOperand &SegReg = MI->getOperand(Op + X86::AddrSegmentReg); + + O << markup("<mem:"); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op + X86::AddrSegmentReg, O); + O << ':'; + } + + if (DispSpec.isImm()) { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) + O << formatImm(DispVal); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + DispSpec.getExpr()->print(O, &MAI); + } + + if (IndexReg.getReg() || BaseReg.getReg()) { + O << '('; + if (BaseReg.getReg()) + printOperand(MI, Op + X86::AddrBaseReg, O); + + if (IndexReg.getReg()) { + O << ','; + printOperand(MI, Op + X86::AddrIndexReg, O); + unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm(); + if (ScaleVal != 1) { + O << ',' << markup("<imm:") << ScaleVal // never printed in hex. + << markup(">"); + } + } + O << ')'; + } + + O << markup(">"); +} + +void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &SegReg = MI->getOperand(Op + 1); + + O << markup("<mem:"); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op + 1, O); + O << ':'; + } + + O << "("; + printOperand(MI, Op, O); + O << ")"; + + O << markup(">"); +} + +void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << markup("<mem:"); + + O << "%es:("; + printOperand(MI, Op, O); + O << ")"; + + O << markup(">"); +} + +void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &DispSpec = MI->getOperand(Op); + const MCOperand &SegReg = MI->getOperand(Op + 1); + + O << markup("<mem:"); + + // If this has a segment register, print it. 
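+ // (e.g. a %gs-relative access prints as "%gs:0x30"; a plain moffs operand,
+ // such as the absolute address in "movabsq 0x1122334455667788, %rax",
+ // prints just the displacement -- illustrative examples.)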
+ if (SegReg.getReg()) { + printOperand(MI, Op + 1, O); + O << ':'; + } + + if (DispSpec.isImm()) { + O << formatImm(DispSpec.getImm()); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement?"); + DispSpec.getExpr()->print(O, &MAI); + } + + O << markup(">"); +} + +void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff) + << markup(">"); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h new file mode 100644 index 0000000..bbb3090 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -0,0 +1,142 @@ +//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an X86 MCInst to AT&T style .s file syntax. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class X86ATTInstPrinter final : public MCInstPrinter { +public: + X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, + const MCSubtargetInfo &STI) override; + + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + + // Autogenerated by tblgen. 
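+ // (These are implemented in the TableGen-generated X86GenAsmWriter.inc that
+ // the .cpp includes; e.g. getRegisterName(X86::RAX) yields "rax", and
+ // printRegName() wraps it with the '%' prefix and markup.)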
+ void printInstruction(const MCInst *MI, raw_ostream &OS); + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS); + + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + void printMemOffs32(const MCInst *MI, 
unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + +private: + bool HasCustomInstComment; +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp new file mode 100644 index 0000000..73f654c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -0,0 +1,820 @@ +//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. +// +//===----------------------------------------------------------------------===// + +#include "X86InstComments.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/MC/MCInst.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static unsigned getVectorRegSize(unsigned RegNo) { + if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) + return 512; + if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31) + return 256; + if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31) + return 128; + if (X86::MM0 <= RegNo && RegNo <= X86::MM7) + return 64; + + llvm_unreachable("Unknown vector reg!"); +} + +static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT, + unsigned OperandIndex) { + unsigned OpReg = MI->getOperand(OperandIndex).getReg(); + return MVT::getVectorVT(ScalarVT, + getVectorRegSize(OpReg)/ScalarVT.getSizeInBits()); +} + +/// \brief Extracts the src/dst types for a given zero extension instruction. +/// \note While the number of elements in DstVT type correct, the +/// number in the SrcVT type is expanded to fill the src xmm register and the +/// upper elements may not be included in the dst xmm/ymm register. 
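+/// For example (illustrative): for PMOVZXBWrr this reports SrcVT = v16i8 and
+/// DstVT = v8i16, even though only the low 8 source bytes contribute to the
+/// result.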
+static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown zero extension instruction"); + // i8 zero extension + case X86::PMOVZXBWrm: + case X86::PMOVZXBWrr: + case X86::VPMOVZXBWrm: + case X86::VPMOVZXBWrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v8i16; + break; + case X86::VPMOVZXBWYrm: + case X86::VPMOVZXBWYrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v16i16; + break; + case X86::PMOVZXBDrm: + case X86::PMOVZXBDrr: + case X86::VPMOVZXBDrm: + case X86::VPMOVZXBDrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v4i32; + break; + case X86::VPMOVZXBDYrm: + case X86::VPMOVZXBDYrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v8i32; + break; + case X86::PMOVZXBQrm: + case X86::PMOVZXBQrr: + case X86::VPMOVZXBQrm: + case X86::VPMOVZXBQrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v2i64; + break; + case X86::VPMOVZXBQYrm: + case X86::VPMOVZXBQYrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v4i64; + break; + // i16 zero extension + case X86::PMOVZXWDrm: + case X86::PMOVZXWDrr: + case X86::VPMOVZXWDrm: + case X86::VPMOVZXWDrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v4i32; + break; + case X86::VPMOVZXWDYrm: + case X86::VPMOVZXWDYrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v8i32; + break; + case X86::PMOVZXWQrm: + case X86::PMOVZXWQrr: + case X86::VPMOVZXWQrm: + case X86::VPMOVZXWQrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v2i64; + break; + case X86::VPMOVZXWQYrm: + case X86::VPMOVZXWQYrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v4i64; + break; + // i32 zero extension + case X86::PMOVZXDQrm: + case X86::PMOVZXDQrr: + case X86::VPMOVZXDQrm: + case X86::VPMOVZXDQrr: + SrcVT = MVT::v4i32; + DstVT = MVT::v2i64; + break; + case X86::VPMOVZXDQYrm: + case X86::VPMOVZXDQYrr: + SrcVT = MVT::v4i32; + DstVT = MVT::v4i64; + break; + } +} + +#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: \ + case X86::V##Inst##Suffix##src##k: \ + case X86::V##Inst##Suffix##src##kz: + +#define CASE_SSE_INS_COMMON(Inst, src) \ + case X86::Inst##src: + +#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: + +#define CASE_MOVDUP(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_UNPCK(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_SHUF(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) \ + CASE_AVX_INS_COMMON(Inst, , r##src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src##i) \ + CASE_SSE_INS_COMMON(Inst, r##src##i) \ + +#define CASE_VPERM(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, src##i) \ + CASE_AVX_INS_COMMON(Inst, , src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, src##i) \ + +#define CASE_VSHUF(Inst, src) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \ + +/// \brief Extracts the types and if it has memory operand for a 
given +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction. +static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) { + HasMemOp = false; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown VSHUF64x2 family instructions."); + break; + CASE_VSHUF(64X2, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(64X2, r) + VT = getRegOperandVectorVT(MI, MVT::i64, 0); + break; + CASE_VSHUF(32X4, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(32X4, r) + VT = getRegOperandVectorVT(MI, MVT::i32, 0); + break; + } +} + +//===----------------------------------------------------------------------===// +// Top Level Entrypoint +//===----------------------------------------------------------------------===// + +/// EmitAnyX86InstComments - This function decodes x86 instructions and prints +/// newline terminated strings to the specified string if desired. This +/// information is shown in disassembly dumps when verbose assembly is enabled. +bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)) { + // If this is a shuffle operation, the switch should fill in this state. + SmallVector<int, 8> ShuffleMask; + const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr; + + switch (MI->getOpcode()) { + default: + // Not an instruction for which we can decode comments. + return false; + + case X86::BLENDPDrri: + case X86::VBLENDPDrri: + case X86::VBLENDPDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::BLENDPDrmi: + case X86::VBLENDPDrmi: + case X86::VBLENDPDYrmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::BLENDPSrri: + case X86::VBLENDPSrri: + case X86::VBLENDPSYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::BLENDPSrmi: + case X86::VBLENDPSrmi: + case X86::VBLENDPSYrmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::PBLENDWrri: + case X86::VPBLENDWrri: + case X86::VPBLENDWYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PBLENDWrmi: + case X86::VPBLENDWrmi: + case X86::VPBLENDWYrmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPBLENDDrri: + case X86::VPBLENDDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VPBLENDDrmi: + case X86::VPBLENDDYrmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::INSERTPSrm: + case X86::VINSERTPSrm: + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeINSERTPSMask(MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::MOVLHPSrr: + case X86::VMOVLHPSrr: + case X86::VMOVLHPSZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVLHPSMask(2, ShuffleMask); + break; + + case X86::MOVHLPSrr: + case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVHLPSMask(2, ShuffleMask); + break; + + CASE_MOVDUP(MOVSLDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); + // FALL THROUGH. + CASE_MOVDUP(MOVSLDUP, m) + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); + break; + + CASE_MOVDUP(MOVSHDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); + // FALL THROUGH. + CASE_MOVDUP(MOVSHDUP, m) + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); + break; + + CASE_MOVDUP(MOVDDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); + // FALL THROUGH. + CASE_MOVDUP(MOVDDUP, m) + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); + break; + + case X86::PSLLDQri: + case X86::VPSLLDQri: + case X86::VPSLLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSRLDQri: + case X86::VPSRLDQri: + case X86::VPSRLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PALIGNR128rr: + case X86::VPALIGNR128rr: + case X86::VPALIGNR256rr: + Src1Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::PALIGNR128rm: + case X86::VPALIGNR128rm: + case X86::VPALIGNR256rm: + Src2Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSHUFDri: + case X86::VPSHUFDri: + case X86::VPSHUFDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFDmi: + case X86::VPSHUFDmi: + case X86::VPSHUFDYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSHUFHWri: + case X86::VPSHUFHWri: + case X86::VPSHUFHWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFHWmi: + case X86::VPSHUFHWmi: + case X86::VPSHUFHWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSHUFLWri: + case X86::VPSHUFLWri: + case X86::VPSHUFLWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFLWmi: + case X86::VPSHUFLWmi: + case X86::VPSHUFLWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::MMX_PSHUFWri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MMX_PSHUFWmi: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(MVT::v4i16, + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSWAPDrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSWAPDrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSWAPMask(MVT::v2i32, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHBW, r) + case X86::MMX_PUNPCKHBWirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKHBW, m) + case X86::MMX_PUNPCKHBWirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHWD, r) + case X86::MMX_PUNPCKHWDirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKHWD, m) + case X86::MMX_PUNPCKHWDirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHDQ, r) + case X86::MMX_PUNPCKHDQirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ CASE_UNPCK(PUNPCKHDQ, m) + case X86::MMX_PUNPCKHDQirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHQDQ, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKHQDQ, m) + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLBW, r) + case X86::MMX_PUNPCKLBWirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKLBW, m) + case X86::MMX_PUNPCKLBWirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLWD, r) + case X86::MMX_PUNPCKLWDirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKLWD, m) + case X86::MMX_PUNPCKLWDirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLDQ, r) + case X86::MMX_PUNPCKLDQirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKLDQ, m) + case X86::MMX_PUNPCKLDQirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLQDQ, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKLQDQ, m) + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); + break; + + CASE_SHUF(SHUFPD, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_SHUF(SHUFPD, m) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_SHUF(SHUFPS, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ CASE_SHUF(SHUFPS, m) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VSHUF(64X2, r) + CASE_VSHUF(64X2, m) + CASE_VSHUF(32X4, r) + CASE_VSHUF(32X4, m) { + MVT VT; + bool HasMemOp; + unsigned NumOp = MI->getNumOperands(); + getVSHUF64x2FamilyInfo(MI, VT, HasMemOp); + decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + if (HasMemOp) { + assert((NumOp >= 8) && "Expected at least 8 operands!"); + Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg()); + } else { + assert((NumOp >= 4) && "Expected at least 4 operands!"); + Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg()); + Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg()); + } + break; + } + + CASE_UNPCK(UNPCKLPD, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(UNPCKLPD, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKLPS, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(UNPCKLPS, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKHPD, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(UNPCKHPD, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKHPS, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(UNPCKHPS, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VPERM(PERMILPS, r) + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + CASE_VPERM(PERMILPS, m) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VPERM(PERMILPD, r) + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + CASE_VPERM(PERMILPD, m) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPERM2F128rr: + case X86::VPERM2I128rr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPERM2F128rm: + case X86::VPERM2I128rm: + // For instruction comments purpose, assume the 256-bit vector is v4i64. 
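+ // (Each mask element then covers one 64-bit lane. Illustrative example:
+ // "vperm2f128 $0x31, %ymm1, %ymm0, %ymm2" selects the high 128-bit halves
+ // of both sources and gets a comment along the lines of
+ // "ymm2 = ymm0[2,3],ymm1[2,3]"; exact formatting may differ.)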
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeVPERM2X128Mask(MVT::v4i64, + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPERMQYri: + case X86::VPERMPDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMQYmi: + case X86::VPERMPDYmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeVPERMMask(MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVSDrr: + case X86::VMOVSDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVSDrm: + case X86::VMOVSDrm: + DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVSSrr: + case X86::VMOVSSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVSSrm: + case X86::VMOVSSrm: + DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVPQI2QIrr: + case X86::MOVZPQILo2PQIrr: + case X86::VMOVPQI2QIrr: + case X86::VMOVZPQILo2PQIrr: + case X86::VMOVZPQILo2PQIZrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVQI2PQIrm: + case X86::MOVZQI2PQIrm: + case X86::MOVZPQILo2PQIrm: + case X86::VMOVQI2PQIrm: + case X86::VMOVZQI2PQIrm: + case X86::VMOVZPQILo2PQIrm: + case X86::VMOVZPQILo2PQIZrm: + DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVDI2PDIrm: + case X86::VMOVDI2PDIrm: + DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::EXTRQI: + if (MI->getOperand(2).isImm() && + MI->getOperand(3).isImm()) + DecodeEXTRQIMask(MI->getOperand(2).getImm(), + MI->getOperand(3).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + + case X86::INSERTQI: + if (MI->getOperand(3).isImm() && + MI->getOperand(4).isImm()) + DecodeINSERTQIMask(MI->getOperand(3).getImm(), + MI->getOperand(4).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + break; + + case X86::PMOVZXBWrr: + case X86::PMOVZXBDrr: + case X86::PMOVZXBQrr: + case X86::PMOVZXWDrr: + case X86::PMOVZXWQrr: + case X86::PMOVZXDQrr: + case X86::VPMOVZXBWrr: + case X86::VPMOVZXBDrr: + case X86::VPMOVZXBQrr: + case X86::VPMOVZXWDrr: + case X86::VPMOVZXWQrr: + case X86::VPMOVZXDQrr: + case X86::VPMOVZXBWYrr: + case X86::VPMOVZXBDYrr: + case X86::VPMOVZXBQYrr: + case X86::VPMOVZXWDYrr: + case X86::VPMOVZXWQYrr: + case X86::VPMOVZXDQYrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
+ case X86::PMOVZXBWrm: + case X86::PMOVZXBDrm: + case X86::PMOVZXBQrm: + case X86::PMOVZXWDrm: + case X86::PMOVZXWQrm: + case X86::PMOVZXDQrm: + case X86::VPMOVZXBWrm: + case X86::VPMOVZXBDrm: + case X86::VPMOVZXBQrm: + case X86::VPMOVZXWDrm: + case X86::VPMOVZXWQrm: + case X86::VPMOVZXDQrm: + case X86::VPMOVZXBWYrm: + case X86::VPMOVZXBDYrm: + case X86::VPMOVZXBQYrm: + case X86::VPMOVZXWDYrm: + case X86::VPMOVZXWQYrm: + case X86::VPMOVZXDQYrm: { + MVT SrcVT, DstVT; + getZeroExtensionTypes(MI, SrcVT, DstVT); + DecodeZeroExtendMask(SrcVT, DstVT, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + } break; + } + + // The only comments we decode are shuffles, so give up if we were unable to + // decode a shuffle mask. + if (ShuffleMask.empty()) + return false; + + if (!DestName) DestName = Src1Name; + OS << (DestName ? DestName : "mem") << " = "; + + // If the two sources are the same, canonicalize the input elements to be + // from the first src so that we get larger element spans. + if (Src1Name == Src2Name) { + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if ((int)ShuffleMask[i] >= 0 && // Not sentinel. + ShuffleMask[i] >= (int)e) // From second mask. + ShuffleMask[i] -= e; + } + } + + // The shuffle mask specifies which elements of the src1/src2 fill in the + // destination, with a few sentinel values. Loop through and print them + // out. + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if (i != 0) + OS << ','; + if (ShuffleMask[i] == SM_SentinelZero) { + OS << "zero"; + continue; + } + + // Otherwise, it must come from src1 or src2. Print the span of elements + // that comes from this src. + bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); + const char *SrcName = isSrc1 ? Src1Name : Src2Name; + OS << (SrcName ? SrcName : "mem") << '['; + bool IsFirst = true; + while (i != e && (int)ShuffleMask[i] != SM_SentinelZero && + (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { + if (!IsFirst) + OS << ','; + else + IsFirst = false; + if (ShuffleMask[i] == SM_SentinelUndef) + OS << "u"; + else + OS << ShuffleMask[i] % ShuffleMask.size(); + ++i; + } + OS << ']'; + --i; // For loop increments element #. + } + //MI->print(OS, 0); + OS << "\n"; + + // We successfully added a comment to this instruction. + return true; +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h new file mode 100644 index 0000000..687581b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h @@ -0,0 +1,25 @@ +//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. 
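+// For example (illustrative), "vblendps $0xa, %xmm1, %xmm0, %xmm2" would be
+// annotated with a comment along the lines of
+// "xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]".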
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H + +namespace llvm { + class MCInst; + class raw_ostream; + bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)); +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp new file mode 100644 index 0000000..879378f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -0,0 +1,257 @@ +//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file includes code for rendering MCInst instances as Intel-style +// assembly. +// +//===----------------------------------------------------------------------===// + +#include "X86IntelInstPrinter.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86InstComments.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include <cctype> +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#include "X86GenAsmWriter1.inc" + +void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + +void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, + const MCSubtargetInfo &STI) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + if (TSFlags & X86II::LOCK) + OS << "\tlock\n"; + + printInstruction(MI, OS); + + // Next always print the annotation. + printAnnotation(OS, Annot); + + // If verbose assembly is enabled, we can print some informative comments. 
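+ // (Unlike the AT&T printer, which runs EmitAnyX86InstComments() before
+ // printing so that its printOperand() can consult HasCustomInstComment, the
+ // Intel printer only needs the comments after the instruction text.)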
+ if (CommentStream) + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); +} + +void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid avxcc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; + } +} + +void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid xopcc argument!"); + case 0: O << "lt"; break; + case 1: O << "le"; break; + case 2: O << "gt"; break; + case 3: O << "ge"; break; + case 4: O << "eq"; break; + case 5: O << "neq"; break; + case 6: O << "false"; break; + case 7: O << "true"; break; + } +} + +void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0x3; + switch (Imm) { + case 0: O << "{rn-sae}"; break; + case 1: O << "{rd-sae}"; break; + case 2: O << "{ru-sae}"; break; + case 3: O << "{rz-sae}"; break; + } +} + +/// printPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value. +void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + O << formatImm(Op.getImm()); + else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { + O << formatHex((uint64_t)Address); + } + else { + // Otherwise, just print the expression. 
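+ // (e.g. a branch to a local label simply prints the symbol, such as
+ // ".LBB0_1" -- illustrative.)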
+ Op.getExpr()->print(O, &MAI); + } + } +} + +void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + } else if (Op.isImm()) { + O << formatImm((int64_t)Op.getImm()); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + Op.getExpr()->print(O, &MAI); + } +} + +void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); + unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); + const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); + const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+X86::AddrSegmentReg, O); + O << ':'; + } + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(MI, Op+X86::AddrBaseReg, O); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(MI, Op+X86::AddrIndexReg, O); + NeedPlus = true; + } + + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + DispSpec.getExpr()->print(O, &MAI); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << formatImm(DispVal); + } + } + + O << ']'; +} + +void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &SegReg = MI->getOperand(Op+1); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+1, O); + O << ':'; + } + O << '['; + printOperand(MI, Op, O); + O << ']'; +} + +void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + // DI accesses are always ES-based. + O << "es:["; + printOperand(MI, Op, O); + O << ']'; +} + +void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &DispSpec = MI->getOperand(Op); + const MCOperand &SegReg = MI->getOperand(Op+1); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+1, O); + O << ':'; + } + + O << '['; + + if (DispSpec.isImm()) { + O << formatImm(DispSpec.getImm()); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement?"); + DispSpec.getExpr()->print(O, &MAI); + } + + O << ']'; +} + +void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << formatImm(MI->getOperand(Op).getImm() & 0xff); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h new file mode 100644 index 0000000..20cd7ff --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -0,0 +1,162 @@ +//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This class prints an X86 MCInst to Intel style .s file syntax. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class X86IntelInstPrinter final : public MCInstPrinter { +public: + X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, + const MCSubtargetInfo &STI) override; + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); + void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O); + + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "opaque ptr "; + printMemReference(MI, OpNo, O); + } + + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printMemReference(MI, OpNo, O); + } + void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printMemReference(MI, OpNo, O); + } + void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printMemReference(MI, OpNo, O); + } + void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printMemReference(MI, OpNo, O); + } + void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "xmmword ptr "; + printMemReference(MI, OpNo, O); + } + void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "zmmword ptr "; + printMemReference(MI, OpNo, O); + } + void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printMemReference(MI, OpNo, O); + } + void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printMemReference(MI, OpNo, O); + } + void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "xword ptr "; + printMemReference(MI, OpNo, O); + } + void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "xmmword ptr "; + printMemReference(MI, OpNo, O); + } + void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printf512mem(const MCInst *MI, unsigned 
OpNo, raw_ostream &O) { + O << "zmmword ptr "; + printMemReference(MI, OpNo, O); + } + + + void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printSrcIdx(MI, OpNo, O); + } + void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printDstIdx(MI, OpNo, O); + } + void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printDstIdx(MI, OpNo, O); + } + void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printDstIdx(MI, OpNo, O); + } + void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printDstIdx(MI, OpNo, O); + } + void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printMemOffset(MI, OpNo, O); + } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp new file mode 100644 index 0000000..133bd0e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -0,0 +1,855 @@ +//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +static unsigned getFixupKindLog2Size(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("invalid fixup kind!"); + case FK_PCRel_1: + case FK_SecRel_1: + case FK_Data_1: + return 0; + case FK_PCRel_2: + case FK_SecRel_2: + case FK_Data_2: + return 1; + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_signed_4byte: + case X86::reloc_global_offset_table: + case FK_SecRel_4: + case FK_Data_4: + return 2; + case FK_PCRel_8: + case FK_SecRel_8: + case FK_Data_8: + case X86::reloc_global_offset_table8: + return 3; + } +} + +namespace { + +class X86ELFObjectWriter : public MCELFObjectTargetWriter { +public: + X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine, + bool HasRelocationAddend, bool foobar) + : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {} +}; + +class X86AsmBackend : public MCAsmBackend { + const StringRef CPU; + bool HasNopl; + uint64_t MaxNopLength; +public: + X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU) { + HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && + CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && + CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && + CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" && + CPU != "c3" && CPU != "c3-2"; + // Max length of true long nop instruction is 15 bytes. + // Max length of long nop replacement instruction is 7 bytes. + // Taking into account SilverMont architecture features max length of nops + // is reduced for it to achieve better performance. + MaxNopLength = (!HasNopl || CPU == "slm") ? 7 : 15; + } + + unsigned getNumFixupKinds() const override { + return X86::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { + const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { + { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }, + { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel}, + { "reloc_signed_4byte", 0, 4 * 8, 0}, + { "reloc_global_offset_table", 0, 4 * 8, 0} + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override { + unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); + + assert(Fixup.getOffset() + Size <= DataSize && + "Invalid fixup offset!"); + + // Check that uppper bits are either all zeros or all ones. 
+ // Specifically ignore overflow/underflow as long as the leakage is + // limited to the lower bits. This is to remain compatible with + // other assemblers. + assert(isIntN(Size * 8 + 1, Value) && + "Value does not fit in the Fixup field"); + + for (unsigned i = 0; i != Size; ++i) + Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); + } + + bool mayNeedRelaxation(const MCInst &Inst) const override; + + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override; + + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; +}; +} // end anonymous namespace + +static unsigned getRelaxedOpcodeBranch(unsigned Op) { + switch (Op) { + default: + return Op; + + case X86::JAE_1: return X86::JAE_4; + case X86::JA_1: return X86::JA_4; + case X86::JBE_1: return X86::JBE_4; + case X86::JB_1: return X86::JB_4; + case X86::JE_1: return X86::JE_4; + case X86::JGE_1: return X86::JGE_4; + case X86::JG_1: return X86::JG_4; + case X86::JLE_1: return X86::JLE_4; + case X86::JL_1: return X86::JL_4; + case X86::JMP_1: return X86::JMP_4; + case X86::JNE_1: return X86::JNE_4; + case X86::JNO_1: return X86::JNO_4; + case X86::JNP_1: return X86::JNP_4; + case X86::JNS_1: return X86::JNS_4; + case X86::JO_1: return X86::JO_4; + case X86::JP_1: return X86::JP_4; + case X86::JS_1: return X86::JS_4; + } +} + +static unsigned getRelaxedOpcodeArith(unsigned Op) { + switch (Op) { + default: + return Op; + + // IMUL + case X86::IMUL16rri8: return X86::IMUL16rri; + case X86::IMUL16rmi8: return X86::IMUL16rmi; + case X86::IMUL32rri8: return X86::IMUL32rri; + case X86::IMUL32rmi8: return X86::IMUL32rmi; + case X86::IMUL64rri8: return X86::IMUL64rri32; + case X86::IMUL64rmi8: return X86::IMUL64rmi32; + + // AND + case X86::AND16ri8: return X86::AND16ri; + case X86::AND16mi8: return X86::AND16mi; + case X86::AND32ri8: return X86::AND32ri; + case X86::AND32mi8: return X86::AND32mi; + case X86::AND64ri8: return X86::AND64ri32; + case X86::AND64mi8: return X86::AND64mi32; + + // OR + case X86::OR16ri8: return X86::OR16ri; + case X86::OR16mi8: return X86::OR16mi; + case X86::OR32ri8: return X86::OR32ri; + case X86::OR32mi8: return X86::OR32mi; + case X86::OR64ri8: return X86::OR64ri32; + case X86::OR64mi8: return X86::OR64mi32; + + // XOR + case X86::XOR16ri8: return X86::XOR16ri; + case X86::XOR16mi8: return X86::XOR16mi; + case X86::XOR32ri8: return X86::XOR32ri; + case X86::XOR32mi8: return X86::XOR32mi; + case X86::XOR64ri8: return X86::XOR64ri32; + case X86::XOR64mi8: return X86::XOR64mi32; + + // ADD + case X86::ADD16ri8: return X86::ADD16ri; + case X86::ADD16mi8: return X86::ADD16mi; + case X86::ADD32ri8: return X86::ADD32ri; + case X86::ADD32mi8: return X86::ADD32mi; + case X86::ADD64ri8: return X86::ADD64ri32; + case X86::ADD64mi8: return X86::ADD64mi32; + + // ADC + case X86::ADC16ri8: return X86::ADC16ri; + case X86::ADC16mi8: return X86::ADC16mi; + case X86::ADC32ri8: return X86::ADC32ri; + case X86::ADC32mi8: return X86::ADC32mi; + case X86::ADC64ri8: return X86::ADC64ri32; + case X86::ADC64mi8: return X86::ADC64mi32; + + // SUB + case X86::SUB16ri8: return X86::SUB16ri; + case X86::SUB16mi8: return X86::SUB16mi; + case X86::SUB32ri8: return X86::SUB32ri; + case X86::SUB32mi8: return X86::SUB32mi; + case X86::SUB64ri8: return X86::SUB64ri32; + case X86::SUB64mi8: return X86::SUB64mi32; + + // SBB + case X86::SBB16ri8: return X86::SBB16ri; + case 
X86::SBB16mi8: return X86::SBB16mi; + case X86::SBB32ri8: return X86::SBB32ri; + case X86::SBB32mi8: return X86::SBB32mi; + case X86::SBB64ri8: return X86::SBB64ri32; + case X86::SBB64mi8: return X86::SBB64mi32; + + // CMP + case X86::CMP16ri8: return X86::CMP16ri; + case X86::CMP16mi8: return X86::CMP16mi; + case X86::CMP32ri8: return X86::CMP32ri; + case X86::CMP32mi8: return X86::CMP32mi; + case X86::CMP64ri8: return X86::CMP64ri32; + case X86::CMP64mi8: return X86::CMP64mi32; + + // PUSH + case X86::PUSH32i8: return X86::PUSHi32; + case X86::PUSH16i8: return X86::PUSHi16; + case X86::PUSH64i8: return X86::PUSH64i32; + } +} + +static unsigned getRelaxedOpcode(unsigned Op) { + unsigned R = getRelaxedOpcodeArith(Op); + if (R != Op) + return R; + return getRelaxedOpcodeBranch(Op); +} + +bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { + // Branches can always be relaxed. + if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode()) + return true; + + // Check if this instruction is ever relaxable. + if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode()) + return false; + + + // Check if the relaxable operand has an expression. For the current set of + // relaxable instructions, the relaxable operand is always the last operand. + unsigned RelaxableOp = Inst.getNumOperands() - 1; + if (Inst.getOperand(RelaxableOp).isExpr()) + return true; + + return false; +} + +bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + +// FIXME: Can tblgen help at all here to verify there aren't other instructions +// we can relax? +void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { + // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. + unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode()); + + if (RelaxedOp == Inst.getOpcode()) { + SmallString<256> Tmp; + raw_svector_ostream OS(Tmp); + Inst.dump_pretty(OS); + OS << "\n"; + report_fatal_error("unexpected instruction to relax: " + OS.str()); + } + + Res = Inst; + Res.setOpcode(RelaxedOp); +} + +/// \brief Write a sequence of optimal nops to the output, covering \p Count +/// bytes. +/// \return - true on success, false on failure +bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + static const uint8_t TrueNops[10][10] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // nopl (%[re]ax) + {0x0f, 0x1f, 0x00}, + // nopl 0(%[re]ax) + {0x0f, 0x1f, 0x40, 0x00}, + // nopl 0(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopl 0L(%[re]ax) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw 0L(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw %cs:0L(%[re]ax,%[re]ax,1) + {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + }; + + // Alternative nop instructions for CPUs which don't support long nops. 
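// Worked example (editorial illustration, not part of the patch): with HasNopl and
// the default MaxNopLength of 15, padding Count = 17 bytes proceeds as follows.
//   Iteration 1: ThisNopLength = min(17, 15) = 15, Prefixes = 15 - 10 = 5, so five
//                0x66 operand-size prefixes are written, followed by the 10-byte
//                "nopw %cs:0L(%[re]ax,%[re]ax,1)" entry (Nops[9]).
//   Iteration 2: ThisNopLength = 2, Prefixes = 0, so the 2-byte "xchg %ax,%ax"
//                entry is written and Count reaches 0.
// The 17 bytes therefore decode as just two instructions. When HasNopl is false,
// the same emission loop instead draws from the alternative table that follows.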
+ static const uint8_t AltNops[7][10] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // lea 0x0(%esi),%esi + {0x8d, 0x76, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0x74, 0x26, 0x00}, + // nop + lea 0x0(%esi),%esi + {0x90, 0x8d, 0x74, 0x26, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 }, + // lea 0x0(%esi),%esi + {0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00}, + }; + + // Select the right NOP table. + // FIXME: Can we get if CPU supports long nops from the subtarget somehow? + const uint8_t (*Nops)[10] = HasNopl ? TrueNops : AltNops; + assert(HasNopl || MaxNopLength <= 7); + + // Emit as many largest nops as needed, then emit a nop of the remaining + // length. + do { + const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); + const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10; + for (uint8_t i = 0; i < Prefixes; i++) + OW->write8(0x66); + const uint8_t Rest = ThisNopLength - Prefixes; + for (uint8_t i = 0; i < Rest; i++) + OW->write8(Nops[Rest - 1][i]); + Count -= ThisNopLength; + } while (Count != 0); + + return true; +} + +/* *** */ + +namespace { + +class ELFX86AsmBackend : public X86AsmBackend { +public: + uint8_t OSABI; + ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : X86AsmBackend(T, CPU), OSABI(OSABI) {} +}; + +class ELFX86_32AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386); + } +}; + +class ELFX86_X32AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_X86_64); + } +}; + +class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend { +public: + ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_IAMCU); + } +}; + +class ELFX86_64AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64); + } +}; + +class WindowsX86AsmBackend : public X86AsmBackend { + bool Is64Bit; + +public: + WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU) + : X86AsmBackend(T, CPU) + , Is64Bit(is64Bit) { + } + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86WinCOFFObjectWriter(OS, Is64Bit); + } +}; + +namespace CU { + + /// Compact unwind encoding values. + enum CompactUnwindEncodings { + /// [RE]BP based frame where [RE]BP is pused on the stack immediately after + /// the return address, then [RE]SP is moved to [RE]BP. + UNWIND_MODE_BP_FRAME = 0x01000000, + + /// A frameless function with a small constant stack size. + UNWIND_MODE_STACK_IMMD = 0x02000000, + + /// A frameless function with a large constant stack size. + UNWIND_MODE_STACK_IND = 0x03000000, + + /// No compact unwind encoding is available. 
+ UNWIND_MODE_DWARF = 0x04000000, + + /// Mask for encoding the frame registers. + UNWIND_BP_FRAME_REGISTERS = 0x00007FFF, + + /// Mask for encoding the frameless registers. + UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF + }; + +} // end CU namespace + +class DarwinX86AsmBackend : public X86AsmBackend { + const MCRegisterInfo &MRI; + + /// \brief Number of registers that can be saved in a compact unwind encoding. + enum { CU_NUM_SAVED_REGS = 6 }; + + mutable unsigned SavedRegs[CU_NUM_SAVED_REGS]; + bool Is64Bit; + + unsigned OffsetSize; ///< Offset of a "push" instruction. + unsigned MoveInstrSize; ///< Size of a "move" instruction. + unsigned StackDivide; ///< Amount to adjust stack size by. +protected: + /// \brief Size of a "push" instruction for the given register. + unsigned PushInstrSize(unsigned Reg) const { + switch (Reg) { + case X86::EBX: + case X86::ECX: + case X86::EDX: + case X86::EDI: + case X86::ESI: + case X86::EBP: + case X86::RBX: + case X86::RBP: + return 1; + case X86::R12: + case X86::R13: + case X86::R14: + case X86::R15: + return 2; + } + return 1; + } + + /// \brief Implementation of algorithm to generate the compact unwind encoding + /// for the CFI instructions. + uint32_t + generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const { + if (Instrs.empty()) return 0; + + // Reset the saved registers. + unsigned SavedRegIdx = 0; + memset(SavedRegs, 0, sizeof(SavedRegs)); + + bool HasFP = false; + + // Encode that we are using EBP/RBP as the frame pointer. + uint32_t CompactUnwindEncoding = 0; + + unsigned SubtractInstrIdx = Is64Bit ? 3 : 2; + unsigned InstrOffset = 0; + unsigned StackAdjust = 0; + unsigned StackSize = 0; + unsigned PrevStackSize = 0; + unsigned NumDefCFAOffsets = 0; + + for (unsigned i = 0, e = Instrs.size(); i != e; ++i) { + const MCCFIInstruction &Inst = Instrs[i]; + + switch (Inst.getOperation()) { + default: + // Any other CFI directives indicate a frame that we aren't prepared + // to represent via compact unwind, so just bail out. + return 0; + case MCCFIInstruction::OpDefCfaRegister: { + // Defines a frame pointer. E.g. + // + // movq %rsp, %rbp + // L0: + // .cfi_def_cfa_register %rbp + // + HasFP = true; + assert(MRI.getLLVMRegNum(Inst.getRegister(), true) == + (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!"); + + // Reset the counts. + memset(SavedRegs, 0, sizeof(SavedRegs)); + StackAdjust = 0; + SavedRegIdx = 0; + InstrOffset += MoveInstrSize; + break; + } + case MCCFIInstruction::OpDefCfaOffset: { + // Defines a new offset for the CFA. E.g. + // + // With frame: + // + // pushq %rbp + // L0: + // .cfi_def_cfa_offset 16 + // + // Without frame: + // + // subq $72, %rsp + // L0: + // .cfi_def_cfa_offset 80 + // + PrevStackSize = StackSize; + StackSize = std::abs(Inst.getOffset()) / StackDivide; + ++NumDefCFAOffsets; + break; + } + case MCCFIInstruction::OpOffset: { + // Defines a "push" of a callee-saved register. E.g. + // + // pushq %r15 + // pushq %r14 + // pushq %rbx + // L0: + // subq $120, %rsp + // L1: + // .cfi_offset %rbx, -40 + // .cfi_offset %r14, -32 + // .cfi_offset %r15, -24 + // + if (SavedRegIdx == CU_NUM_SAVED_REGS) + // If there are too many saved registers, we cannot use a compact + // unwind encoding. 
+ return CU::UNWIND_MODE_DWARF; + + unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + SavedRegs[SavedRegIdx++] = Reg; + StackAdjust += OffsetSize; + InstrOffset += PushInstrSize(Reg); + break; + } + } + } + + StackAdjust /= StackDivide; + + if (HasFP) { + if ((StackAdjust & 0xFF) != StackAdjust) + // Offset was too big for a compact unwind encoding. + return CU::UNWIND_MODE_DWARF; + + // Get the encoding of the saved registers when we have a frame pointer. + uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(); + if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; + + CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME; + CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16; + CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS; + } else { + // If the amount of the stack allocation is the size of a register, then + // we "push" the RAX/EAX register onto the stack instead of adjusting the + // stack pointer with a SUB instruction. We don't support the push of the + // RAX/EAX register with compact unwind. So we check for that situation + // here. + if ((NumDefCFAOffsets == SavedRegIdx + 1 && + StackSize - PrevStackSize == 1) || + (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2)) + return CU::UNWIND_MODE_DWARF; + + SubtractInstrIdx += InstrOffset; + ++StackAdjust; + + if ((StackSize & 0xFF) == StackSize) { + // Frameless stack with a small stack size. + CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD; + + // Encode the stack size. + CompactUnwindEncoding |= (StackSize & 0xFF) << 16; + } else { + if ((StackAdjust & 0x7) != StackAdjust) + // The extra stack adjustments are too big for us to handle. + return CU::UNWIND_MODE_DWARF; + + // Frameless stack with an offset too large for us to encode compactly. + CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND; + + // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP' + // instruction. + CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16; + + // Encode any extra stack stack adjustments (done via push + // instructions). + CompactUnwindEncoding |= (StackAdjust & 0x7) << 13; + } + + // Encode the number of registers saved. (Reverse the list first.) + std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]); + CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10; + + // Get the encoding of the saved registers when we don't have a frame + // pointer. + uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx); + if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; + + // Encode the register encoding. + CompactUnwindEncoding |= + RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION; + } + + return CompactUnwindEncoding; + } + +private: + /// \brief Get the compact unwind number for a given register. The number + /// corresponds to the enum lists in compact_unwind_encoding.h. + int getCompactUnwindRegNum(unsigned Reg) const { + static const MCPhysReg CU32BitRegs[7] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const MCPhysReg CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + for (int Idx = 1; *CURegs; ++CURegs, ++Idx) + if (*CURegs == Reg) + return Idx; + + return -1; + } + + /// \brief Return the registers encoded for a compact encoding with a frame + /// pointer. + uint32_t encodeCompactUnwindRegistersWithFrame() const { + // Encode the registers in the order they were saved --- 3-bits per + // register. 
The list of saved registers is assumed to be in reverse + // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS. + uint32_t RegEnc = 0; + for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) { + unsigned Reg = SavedRegs[i]; + if (Reg == 0) break; + + int CURegNum = getCompactUnwindRegNum(Reg); + if (CURegNum == -1) return ~0U; + + // Encode the 3-bit register number in order, skipping over 3-bits for + // each register. + RegEnc |= (CURegNum & 0x7) << (Idx++ * 3); + } + + assert((RegEnc & 0x3FFFF) == RegEnc && + "Invalid compact register encoding!"); + return RegEnc; + } + + /// \brief Create the permutation encoding used with frameless stacks. It is + /// passed the number of registers to be saved and an array of the registers + /// saved. + uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const { + // The saved registers are numbered from 1 to 6. In order to encode the + // order in which they were saved, we re-number them according to their + // place in the register order. The re-numbering is relative to the last + // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in + // that order: + // + // Orig Re-Num + // ---- ------ + // 6 6 + // 2 2 + // 4 3 + // 5 3 + // + for (unsigned i = 0; i < RegCount; ++i) { + int CUReg = getCompactUnwindRegNum(SavedRegs[i]); + if (CUReg == -1) return ~0U; + SavedRegs[i] = CUReg; + } + + // Reverse the list. + std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]); + + uint32_t RenumRegs[CU_NUM_SAVED_REGS]; + for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){ + unsigned Countless = 0; + for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) + if (SavedRegs[j] < SavedRegs[i]) + ++Countless; + + RenumRegs[i] = SavedRegs[i] - Countless - 1; + } + + // Take the renumbered values and encode them into a 10-bit number. + uint32_t permutationEncoding = 0; + switch (RegCount) { + case 6: + permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] + + 6 * RenumRegs[2] + 2 * RenumRegs[3] + + RenumRegs[4]; + break; + case 5: + permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] + + 6 * RenumRegs[3] + 2 * RenumRegs[4] + + RenumRegs[5]; + break; + case 4: + permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] + + 3 * RenumRegs[4] + RenumRegs[5]; + break; + case 3: + permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] + + RenumRegs[5]; + break; + case 2: + permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; + break; + case 1: + permutationEncoding |= RenumRegs[5]; + break; + } + + assert((permutationEncoding & 0x3FF) == permutationEncoding && + "Invalid compact register encoding!"); + return permutationEncoding; + } + +public: + DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU, + bool Is64Bit) + : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) { + memset(SavedRegs, 0, sizeof(SavedRegs)); + OffsetSize = Is64Bit ? 8 : 4; + MoveInstrSize = Is64Bit ? 3 : 2; + StackDivide = Is64Bit ? 8 : 4; + } +}; + +class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { +public: + DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef CPU) + : DarwinX86AsmBackend(T, MRI, CPU, false) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86MachObjectWriter(OS, /*Is64Bit=*/false, + MachO::CPU_TYPE_I386, + MachO::CPU_SUBTYPE_I386_ALL); + } + + /// \brief Generate the compact unwind encoding for the CFI instructions. 
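// Rough worked example (editorial illustration, not from the original file): for a
// 64-bit frame function whose prologue records ".cfi_offset" for %rbx and %r14
// after the ".cfi_def_cfa_register %rbp" directive, generateCompactUnwindEncodingImpl
// above ends up with SavedRegs = {RBX, R14} and StackAdjust = 2 * OffsetSize = 16,
// which becomes 2 after the division by StackDivide. With CU register numbers
// RBX = 1 and R14 = 4, the frame-pointer path then produces
//   UNWIND_MODE_BP_FRAME | (2 << 16) | (1 | (4 << 3))  ==  0x01020021
// Any prologue the encoder cannot express this way falls back to UNWIND_MODE_DWARF.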
+ uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override { + return generateCompactUnwindEncodingImpl(Instrs); + } +}; + +class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { + const MachO::CPUSubTypeX86 Subtype; +public: + DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef CPU, MachO::CPUSubTypeX86 st) + : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86MachObjectWriter(OS, /*Is64Bit=*/true, + MachO::CPU_TYPE_X86_64, Subtype); + } + + /// \brief Generate the compact unwind encoding for the CFI instructions. + uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override { + return generateCompactUnwindEncodingImpl(Instrs); + } +}; + +} // end anonymous namespace + +MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TheTriple, + StringRef CPU) { + if (TheTriple.isOSBinFormatMachO()) + return new DarwinX86_32AsmBackend(T, MRI, CPU); + + if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) + return new WindowsX86AsmBackend(T, false, CPU); + + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.isOSIAMCU()) + return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU); + + return new ELFX86_32AsmBackend(T, OSABI, CPU); +} + +MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TheTriple, + StringRef CPU) { + if (TheTriple.isOSBinFormatMachO()) { + MachO::CPUSubTypeX86 CS = + StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) + .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H) + .Default(MachO::CPU_SUBTYPE_X86_64_ALL); + return new DarwinX86_64AsmBackend(T, MRI, CPU, CS); + } + + if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) + return new WindowsX86AsmBackend(T, true, CPU); + + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.getEnvironment() == Triple::GNUX32) + return new ELFX86_X32AsmBackend(T, OSABI, CPU); + return new ELFX86_64AsmBackend(T, OSABI, CPU); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h new file mode 100644 index 0000000..9ff85b9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -0,0 +1,779 @@ +//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the X86 target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H + +#include "X86MCTargetDesc.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +namespace X86 { + // Enums for memory operand decoding. 
Each memory operand is represented with + // a 5 operand sequence in the form: + // [BaseReg, ScaleAmt, IndexReg, Disp, Segment] + // These enums help decode this. + enum { + AddrBaseReg = 0, + AddrScaleAmt = 1, + AddrIndexReg = 2, + AddrDisp = 3, + + /// AddrSegmentReg - The operand # of the segment in the memory operand. + AddrSegmentReg = 4, + + /// AddrNumOperands - Total number of operands in a memory reference. + AddrNumOperands = 5 + }; + + /// AVX512 static rounding constants. These need to match the values in + /// avx512fintrin.h. + enum STATIC_ROUNDING { + TO_NEAREST_INT = 0, + TO_NEG_INF = 1, + TO_POS_INF = 2, + TO_ZERO = 3, + CUR_DIRECTION = 4 + }; +} // end namespace X86; + +/// X86II - This namespace holds all of the target specific flags that +/// instruction info tracks. +/// +namespace X86II { + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // X86 Specific MachineOperand flags. + + MO_NO_FLAG, + + /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a + /// relocation of: + /// SYMBOL_LABEL + [. - PICBASELABEL] + MO_GOT_ABSOLUTE_ADDRESS, + + /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the + /// immediate should get the value of the symbol minus the PIC base label: + /// SYMBOL_LABEL - PICBASELABEL + MO_PIC_BASE_OFFSET, + + /// MO_GOT - On a symbol operand this indicates that the immediate is the + /// offset to the GOT entry for the symbol name from the base of the GOT. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOT + MO_GOT, + + /// MO_GOTOFF - On a symbol operand this indicates that the immediate is + /// the offset to the location of the symbol name from the base of the GOT. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOTOFF + MO_GOTOFF, + + /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is + /// offset to the GOT entry for the symbol name from the current code + /// location. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOTPCREL + MO_GOTPCREL, + + /// MO_PLT - On a symbol operand this indicates that the immediate is + /// offset to the PLT entry of symbol name from the current code location. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @PLT + MO_PLT, + + /// MO_TLSGD - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index structure that contains + /// the module number and variable offset for the symbol. Used in the + /// general dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSGD + MO_TLSGD, + + /// MO_TLSLD - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. When this index is passed to a call to + /// __tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the x86-64 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLD + MO_TLSLD, + + /// MO_TLSLDM - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. 
When this index is passed to a call to + /// ___tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the IA32 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLDM + MO_TLSLDM, + + /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the thread-pointer offset for the + /// symbol. Used in the x86-64 initial exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @GOTTPOFF + MO_GOTTPOFF, + + /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is + /// the absolute address of the GOT entry with the negative thread-pointer + /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access + /// model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @INDNTPOFF + MO_INDNTPOFF, + + /// MO_TPOFF - On a symbol operand this indicates that the immediate is + /// the thread-pointer offset for the symbol. Used in the x86-64 local + /// exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TPOFF + MO_TPOFF, + + /// MO_DTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS offset of the symbol. Used + /// in the local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @DTPOFF + MO_DTPOFF, + + /// MO_NTPOFF - On a symbol operand this indicates that the immediate is + /// the negative thread-pointer offset for the symbol. Used in the IA32 + /// local exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @NTPOFF + MO_NTPOFF, + + /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the negative thread-pointer offset for + /// the symbol. Used in the PIC IA32 initial exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @GOTNTPOFF + MO_GOTNTPOFF, + + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "__imp_FOO" symbol. This is used for + /// dllimport linkage on windows. + MO_DLLIMPORT, + + /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$stub" symbol. This is used for calls + /// and jumps to external functions on Tiger and earlier. + MO_DARWIN_STUB, + + /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a + /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub. + MO_DARWIN_NONLAZY, + + /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates + /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is + /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub. + MO_DARWIN_NONLAZY_PIC_BASE, + + /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this + /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE", + /// which is a PIC-base-relative reference to a hidden dyld lazy pointer + /// stub. 
+ MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, + + /// MO_TLVP - On a symbol operand this indicates that the immediate is + /// some TLS offset. + /// + /// This is the TLS offset for the Darwin TLS mechanism. + MO_TLVP, + + /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate + /// is some TLS offset from the picbase. + /// + /// This is the 32-bit TLS offset for Darwin TLS in PIC mode. + MO_TLVP_PIC_BASE, + + /// MO_SECREL - On a symbol operand this indicates that the immediate is + /// the offset from beginning of section. + /// + /// This is the TLS offset for the COFF/Windows TLS mechanism. + MO_SECREL + }; + + enum : uint64_t { + //===------------------------------------------------------------------===// + // Instruction encodings. These are the standard/most common forms for X86 + // instructions. + // + + // PseudoFrm - This represents an instruction that is a pseudo instruction + // or one that has not been implemented yet. It is illegal to code generate + // it, but tolerated for intermediate implementation stages. + Pseudo = 0, + + /// Raw - This form is for instructions that don't have any operands, so + /// they are just a fixed opcode value, like 'leave'. + RawFrm = 1, + + /// AddRegFrm - This form is used for instructions like 'push r32' that have + /// their one register operand added to their opcode. + AddRegFrm = 2, + + /// MRMDestReg - This form is used for instructions that use the Mod/RM byte + /// to specify a destination, which in this case is a register. + /// + MRMDestReg = 3, + + /// MRMDestMem - This form is used for instructions that use the Mod/RM byte + /// to specify a destination, which in this case is memory. + /// + MRMDestMem = 4, + + /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte + /// to specify a source, which in this case is a register. + /// + MRMSrcReg = 5, + + /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte + /// to specify a source, which in this case is memory. + /// + MRMSrcMem = 6, + + /// RawFrmMemOffs - This form is for instructions that store an absolute + /// memory offset as an immediate with a possible segment override. + RawFrmMemOffs = 7, + + /// RawFrmSrc - This form is for instructions that use the source index + /// register SI/ESI/RSI with a possible segment override. + RawFrmSrc = 8, + + /// RawFrmDst - This form is for instructions that use the destination index + /// register DI/EDI/ESI. + RawFrmDst = 9, + + /// RawFrmSrc - This form is for instructions that use the source index + /// register SI/ESI/ERI with a possible segment override, and also the + /// destination index register DI/ESI/RDI. + RawFrmDstSrc = 10, + + /// RawFrmImm8 - This is used for the ENTER instruction, which has two + /// immediates, the first of which is a 16-bit immediate (specified by + /// the imm encoding) and the second is a 8-bit fixed value. + RawFrmImm8 = 11, + + /// RawFrmImm16 - This is used for CALL FAR instructions, which have two + /// immediates, the first of which is a 16 or 32-bit immediate (specified by + /// the imm encoding) and the second is a 16-bit fixed value. In the AMD + /// manual, this operand is described as pntr16:32 and pntr16:16 + RawFrmImm16 = 12, + + /// MRMX[rm] - The forms are used to represent instructions that use a + /// Mod/RM byte, and don't use the middle field for anything. 
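// A few concrete instances of the encoding forms above and below, for orientation
// only (editorial illustration; the authoritative mapping lives in the .td files):
//   RawFrm     - "leave" (fixed opcode, no operands)
//   AddRegFrm  - "pushq %rbx" (0x50 + register number)
//   MRMDestMem - "movl %eax, (%rbx)" (ModRM selects a memory destination)
//   MRMSrcMem  - "movl (%rbx), %eax" (ModRM selects a memory source)
//   MRM0m      - "incl (%rax)" (0xFF /0)
//   MRM4r      - "shll $2, %eax" (0xC1 /4)
//   MRM_F8     - "swapgs" (fixed ModRM byte 0xF8 after 0x0F 0x01)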
+ MRMXr = 14, MRMXm = 15, + + /// MRM[0-7][rm] - These forms are used to represent instructions that use + /// a Mod/RM byte, and use the middle field to hold extended opcode + /// information. In the intel manual these are represented as /0, /1, ... + /// + + // First, instructions that operate on a register r/m operand... + MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3 + MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7 + + // Next, instructions that operate on a memory r/m operand... + MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3 + MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7 + + //// MRM_XX - A mod/rm byte of exactly 0xXX. + MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, + MRM_C4 = 36, MRM_C5 = 37, MRM_C6 = 38, MRM_C7 = 39, + MRM_C8 = 40, MRM_C9 = 41, MRM_CA = 42, MRM_CB = 43, + MRM_CC = 44, MRM_CD = 45, MRM_CE = 46, MRM_CF = 47, + MRM_D0 = 48, MRM_D1 = 49, MRM_D2 = 50, MRM_D3 = 51, + MRM_D4 = 52, MRM_D5 = 53, MRM_D6 = 54, MRM_D7 = 55, + MRM_D8 = 56, MRM_D9 = 57, MRM_DA = 58, MRM_DB = 59, + MRM_DC = 60, MRM_DD = 61, MRM_DE = 62, MRM_DF = 63, + MRM_E0 = 64, MRM_E1 = 65, MRM_E2 = 66, MRM_E3 = 67, + MRM_E4 = 68, MRM_E5 = 69, MRM_E6 = 70, MRM_E7 = 71, + MRM_E8 = 72, MRM_E9 = 73, MRM_EA = 74, MRM_EB = 75, + MRM_EC = 76, MRM_ED = 77, MRM_EE = 78, MRM_EF = 79, + MRM_F0 = 80, MRM_F1 = 81, MRM_F2 = 82, MRM_F3 = 83, + MRM_F4 = 84, MRM_F5 = 85, MRM_F6 = 86, MRM_F7 = 87, + MRM_F8 = 88, MRM_F9 = 89, MRM_FA = 90, MRM_FB = 91, + MRM_FC = 92, MRM_FD = 93, MRM_FE = 94, MRM_FF = 95, + + FormMask = 127, + + //===------------------------------------------------------------------===// + // Actual flags... + + // OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix. + // OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in + // 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66 + // prefix in 16-bit mode. + OpSizeShift = 7, + OpSizeMask = 0x3 << OpSizeShift, + + OpSizeFixed = 0 << OpSizeShift, + OpSize16 = 1 << OpSizeShift, + OpSize32 = 2 << OpSizeShift, + + // AsSize - AdSizeX implies this instruction determines its need of 0x67 + // prefix from a normal ModRM memory operand. The other types indicate that + // an operand is encoded with a specific width and a prefix is needed if + // it differs from the current mode. + AdSizeShift = OpSizeShift + 2, + AdSizeMask = 0x3 << AdSizeShift, + + AdSizeX = 1 << AdSizeShift, + AdSize16 = 1 << AdSizeShift, + AdSize32 = 2 << AdSizeShift, + AdSize64 = 3 << AdSizeShift, + + //===------------------------------------------------------------------===// + // OpPrefix - There are several prefix bytes that are used as opcode + // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is + // no prefix. + // + OpPrefixShift = AdSizeShift + 2, + OpPrefixMask = 0x7 << OpPrefixShift, + + // PS, PD - Prefix code for packed single and double precision vector + // floating point operations performed in the SSE registers. + PS = 1 << OpPrefixShift, PD = 2 << OpPrefixShift, + + // XS, XD - These prefix codes are for single and double precision scalar + // floating point operations performed in the SSE registers. + XS = 3 << OpPrefixShift, XD = 4 << OpPrefixShift, + + //===------------------------------------------------------------------===// + // OpMap - This field determines which opcode map this instruction + // belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc. 
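// Minimal sketch (editorial illustration, not part of this header) of how the
// packed fields defined above are read back out of a TSFlags value:
//   uint64_t Form   = TSFlags & X86II::FormMask;                        // e.g. MRMSrcMem
//   bool     Op16   = (TSFlags & X86II::OpSizeMask) == X86II::OpSize16; // 0x66 needed in 32-bit mode
//   uint64_t Prefix = TSFlags & X86II::OpPrefixMask;                    // PS/PD/XS/XD or none
// The accessor functions near the end of this header (getBaseOpcodeFor,
// getSizeOfImm, isImmPCRel, ...) follow the same mask-and-shift pattern.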
+ // + OpMapShift = OpPrefixShift + 3, + OpMapMask = 0x7 << OpMapShift, + + // OB - OneByte - Set if this instruction has a one byte opcode. + OB = 0 << OpMapShift, + + // TB - TwoByte - Set if this instruction has a two byte opcode, which + // starts with a 0x0F byte before the real opcode. + TB = 1 << OpMapShift, + + // T8, TA - Prefix after the 0x0F prefix. + T8 = 2 << OpMapShift, TA = 3 << OpMapShift, + + // XOP8 - Prefix to include use of imm byte. + XOP8 = 4 << OpMapShift, + + // XOP9 - Prefix to exclude use of imm byte. + XOP9 = 5 << OpMapShift, + + // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions. + XOPA = 6 << OpMapShift, + + //===------------------------------------------------------------------===// + // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. + // They are used to specify GPRs and SSE registers, 64-bit operand size, + // etc. We only cares about REX.W and REX.R bits and only the former is + // statically determined. + // + REXShift = OpMapShift + 3, + REX_W = 1 << REXShift, + + //===------------------------------------------------------------------===// + // This three-bit field describes the size of an immediate operand. Zero is + // unused so that we can tell if we forgot to set a value. + ImmShift = REXShift + 1, + ImmMask = 15 << ImmShift, + Imm8 = 1 << ImmShift, + Imm8PCRel = 2 << ImmShift, + Imm16 = 3 << ImmShift, + Imm16PCRel = 4 << ImmShift, + Imm32 = 5 << ImmShift, + Imm32PCRel = 6 << ImmShift, + Imm32S = 7 << ImmShift, + Imm64 = 8 << ImmShift, + + //===------------------------------------------------------------------===// + // FP Instruction Classification... Zero is non-fp instruction. + + // FPTypeMask - Mask for all of the FP types... + FPTypeShift = ImmShift + 4, + FPTypeMask = 7 << FPTypeShift, + + // NotFP - The default, set for instructions that do not use FP registers. + NotFP = 0 << FPTypeShift, + + // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0 + ZeroArgFP = 1 << FPTypeShift, + + // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst + OneArgFP = 2 << FPTypeShift, + + // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a + // result back to ST(0). For example, fcos, fsqrt, etc. + // + OneArgFPRW = 3 << FPTypeShift, + + // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an + // explicit argument, storing the result to either ST(0) or the implicit + // argument. For example: fadd, fsub, fmul, etc... + TwoArgFP = 4 << FPTypeShift, + + // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an + // explicit argument, but have no destination. Example: fucom, fucomi, ... + CompareFP = 5 << FPTypeShift, + + // CondMovFP - "2 operand" floating point conditional move instructions. + CondMovFP = 6 << FPTypeShift, + + // SpecialFP - Special instruction forms. Dispatch by opcode explicitly. + SpecialFP = 7 << FPTypeShift, + + // Lock prefix + LOCKShift = FPTypeShift + 3, + LOCK = 1 << LOCKShift, + + // REP prefix + REPShift = LOCKShift + 1, + REP = 1 << REPShift, + + // Execution domain for SSE instructions. + // 0 means normal, non-SSE instruction. + SSEDomainShift = REPShift + 1, + + // Encoding + EncodingShift = SSEDomainShift + 2, + EncodingMask = 0x3 << EncodingShift, + + // VEX - encoding using 0xC4/0xC5 + VEX = 1 << EncodingShift, + + /// XOP - Opcode prefix used by XOP instructions. 
+ XOP = 2 << EncodingShift, + + // VEX_EVEX - Specifies that this instruction use EVEX form which provides + // syntax support up to 32 512-bit register operands and up to 7 16-bit + // mask operands as well as source operand data swizzling/memory operand + // conversion, eviction hint, and rounding mode. + EVEX = 3 << EncodingShift, + + // Opcode + OpcodeShift = EncodingShift + 2, + + /// VEX_W - Has a opcode specific functionality, but is used in the same + /// way as REX_W is for regular SSE instructions. + VEX_WShift = OpcodeShift + 8, + VEX_W = 1ULL << VEX_WShift, + + /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2 + /// address instructions in SSE are represented as 3 address ones in AVX + /// and the additional register is encoded in VEX_VVVV prefix. + VEX_4VShift = VEX_WShift + 1, + VEX_4V = 1ULL << VEX_4VShift, + + /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode + /// operand 3 with VEX.vvvv. + VEX_4VOp3Shift = VEX_4VShift + 1, + VEX_4VOp3 = 1ULL << VEX_4VOp3Shift, + + /// VEX_I8IMM - Specifies that the last register used in a AVX instruction, + /// must be encoded in the i8 immediate field. This usually happens in + /// instructions with 4 operands. + VEX_I8IMMShift = VEX_4VOp3Shift + 1, + VEX_I8IMM = 1ULL << VEX_I8IMMShift, + + /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current + /// instruction uses 256-bit wide registers. This is usually auto detected + /// if a VR256 register is used, but some AVX instructions also have this + /// field marked when using a f256 memory references. + VEX_LShift = VEX_I8IMMShift + 1, + VEX_L = 1ULL << VEX_LShift, + + // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX + // prefix. Usually used for scalar instructions. Needed by disassembler. + VEX_LIGShift = VEX_LShift + 1, + VEX_LIG = 1ULL << VEX_LIGShift, + + // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field + // with following encoding: + // - 00 V128 + // - 01 V256 + // - 10 V512 + // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros. + // this will save 1 tsflag bit + + // EVEX_K - Set if this instruction requires masking + EVEX_KShift = VEX_LIGShift + 1, + EVEX_K = 1ULL << EVEX_KShift, + + // EVEX_Z - Set if this instruction has EVEX.Z field set. + EVEX_ZShift = EVEX_KShift + 1, + EVEX_Z = 1ULL << EVEX_ZShift, + + // EVEX_L2 - Set if this instruction has EVEX.L' field set. + EVEX_L2Shift = EVEX_ZShift + 1, + EVEX_L2 = 1ULL << EVEX_L2Shift, + + // EVEX_B - Set if this instruction has EVEX.B field set. + EVEX_BShift = EVEX_L2Shift + 1, + EVEX_B = 1ULL << EVEX_BShift, + + // The scaling factor for the AVX512's 8-bit compressed displacement. + CD8_Scale_Shift = EVEX_BShift + 1, + CD8_Scale_Mask = 127ULL << CD8_Scale_Shift, + + /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the + /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents + /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction + /// storing a classifier in the imm8 field. To simplify our implementation, + /// we handle this by storeing the classifier in the opcode field and using + /// this flag to indicate that the encoder should do the wacky 3DNow! thing. + Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7, + Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift, + + /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in + /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. 
+ MemOp4Shift = Has3DNow0F0FOpcodeShift + 1, + MemOp4 = 1ULL << MemOp4Shift, + + /// Explicitly specified rounding control + EVEX_RCShift = MemOp4Shift + 1, + EVEX_RC = 1ULL << EVEX_RCShift + }; + + // getBaseOpcodeFor - This function returns the "base" X86 opcode for the + // specified machine instruction. + // + inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { + return TSFlags >> X86II::OpcodeShift; + } + + inline bool hasImm(uint64_t TSFlags) { + return (TSFlags & X86II::ImmMask) != 0; + } + + /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field + /// of the specified instruction. + inline unsigned getSizeOfImm(uint64_t TSFlags) { + switch (TSFlags & X86II::ImmMask) { + default: llvm_unreachable("Unknown immediate size"); + case X86II::Imm8: + case X86II::Imm8PCRel: return 1; + case X86II::Imm16: + case X86II::Imm16PCRel: return 2; + case X86II::Imm32: + case X86II::Imm32S: + case X86II::Imm32PCRel: return 4; + case X86II::Imm64: return 8; + } + } + + /// isImmPCRel - Return true if the immediate of the specified instruction's + /// TSFlags indicates that it is pc relative. + inline unsigned isImmPCRel(uint64_t TSFlags) { + switch (TSFlags & X86II::ImmMask) { + default: llvm_unreachable("Unknown immediate size"); + case X86II::Imm8PCRel: + case X86II::Imm16PCRel: + case X86II::Imm32PCRel: + return true; + case X86II::Imm8: + case X86II::Imm16: + case X86II::Imm32: + case X86II::Imm32S: + case X86II::Imm64: + return false; + } + } + + /// isImmSigned - Return true if the immediate of the specified instruction's + /// TSFlags indicates that it is signed. + inline unsigned isImmSigned(uint64_t TSFlags) { + switch (TSFlags & X86II::ImmMask) { + default: llvm_unreachable("Unknown immediate signedness"); + case X86II::Imm32S: + return true; + case X86II::Imm8: + case X86II::Imm8PCRel: + case X86II::Imm16: + case X86II::Imm16PCRel: + case X86II::Imm32: + case X86II::Imm32PCRel: + case X86II::Imm64: + return false; + } + } + + /// getOperandBias - compute any additional adjustment needed to + /// the offset to the start of the memory operand + /// in this instruction. + /// If this is a two-address instruction,skip one of the register operands. + /// FIXME: This should be handled during MCInst lowering. + inline int getOperandBias(const MCInstrDesc& Desc) + { + unsigned NumOps = Desc.getNumOperands(); + unsigned CurOp = 0; + if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) + ++CurOp; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) + // Special case for AVX-512 GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) + // SCATTER + ++CurOp; + return CurOp; + } + + /// getMemoryOperandNo - The function returns the MCInst operand # for the + /// first field of the memory operand. If the instruction doesn't have a + /// memory operand, this returns -1. + /// + /// Note that this ignores tied operands. If there is a tied register which + /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only + /// counted as one operand. 
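// Typical use (editorial sketch, not part of this header; Desc and MI stand for
// the instruction's MCInstrDesc and MCInst): callers such as the code emitter
// combine getOperandBias and getMemoryOperandNo to locate the 5-operand memory
// reference, then index into it with the X86::Addr* enums defined earlier:
//   int MemOp = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode());
//   if (MemOp >= 0) {
//     MemOp += X86II::getOperandBias(Desc);       // skip tied defs (GATHER/SCATTER etc.)
//     const MCOperand &Base = MI.getOperand(MemOp + X86::AddrBaseReg);
//     const MCOperand &Disp = MI.getOperand(MemOp + X86::AddrDisp);
//     // ... AddrScaleAmt, AddrIndexReg, AddrSegmentReg likewise.
//   }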
+ /// + inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + + switch (TSFlags & X86II::FormMask) { + default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!"); + case X86II::Pseudo: + case X86II::RawFrm: + case X86II::AddRegFrm: + case X86II::MRMDestReg: + case X86II::MRMSrcReg: + case X86II::RawFrmImm8: + case X86II::RawFrmImm16: + case X86II::RawFrmMemOffs: + case X86II::RawFrmSrc: + case X86II::RawFrmDst: + case X86II::RawFrmDstSrc: + return -1; + case X86II::MRMDestMem: + return 0; + case X86II::MRMSrcMem: + // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a + // mask register. + return 1 + HasVEX_4V + HasMemOp4 + HasEVEX_K; + case X86II::MRMXr: + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + return -1; + case X86II::MRMXm: + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + // Start from 0, skip registers encoded in VEX_VVVV or a mask register. + return 0 + HasVEX_4V + HasEVEX_K; + case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: + case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: + case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: + return -1; + } + } + + /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or + /// higher) register? e.g. r8, xmm8, xmm13, etc. 
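// (Editorial note, not from the original header.) Registers for which this
// predicate returns true are the ones whose encoding needs an extension bit that
// does not fit in the plain ModRM/SIB fields. For example, an encoder sketch:
//   if (X86II::isX86_64ExtendedReg(Reg))
//     REX |= 1 << Bit;   // Bit 0 = REX.B, 1 = REX.X, 2 = REX.R (REX.W is bit 3)
// which is why the check below covers R8-R15 in every width, the upper
// XMM/YMM/ZMM banks, and CR8-CR15.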
+ inline bool isX86_64ExtendedReg(unsigned RegNo) { + if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) || + (RegNo > X86::XMM23 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM7 && RegNo <= X86::YMM15) || + (RegNo > X86::YMM23 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) || + (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31)) + return true; + + switch (RegNo) { + default: break; + case X86::R8: case X86::R9: case X86::R10: case X86::R11: + case X86::R12: case X86::R13: case X86::R14: case X86::R15: + case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D: + case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D: + case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W: + case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W: + case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B: + case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: + case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: + case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: + return true; + } + return false; + } + + /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher) + /// registers? e.g. zmm21, etc. + static inline bool is32ExtendedReg(unsigned RegNo) { + return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM15 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31)); + } + + + inline bool isX86_64NonExtLowByteReg(unsigned reg) { + return (reg == X86::SPL || reg == X86::BPL || + reg == X86::SIL || reg == X86::DIL); + } +} + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp new file mode 100644 index 0000000..736c39d --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -0,0 +1,262 @@ +//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { + class X86ELFObjectWriter : public MCELFObjectTargetWriter { + public: + X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine); + + ~X86ELFObjectWriter() override; + + protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; + }; +} + +X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, + uint16_t EMachine) + : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, + // Only i386 and IAMCU use Rel instead of RelA. 
+ /*HasRelocationAddend*/ + (EMachine != ELF::EM_386) && + (EMachine != ELF::EM_IAMCU)) {} + +X86ELFObjectWriter::~X86ELFObjectWriter() +{} + +enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; + +static X86_64RelType getType64(unsigned Kind, + MCSymbolRefExpr::VariantKind &Modifier, + bool &IsPCRel) { + switch (Kind) { + default: + llvm_unreachable("Unimplemented"); + case X86::reloc_global_offset_table8: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_64; + case FK_Data_8: + return RT64_64; + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel) + return RT64_32S; + return RT64_32; + case X86::reloc_global_offset_table: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_32; + case FK_Data_4: + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return RT64_32; + case FK_PCRel_2: + case FK_Data_2: + return RT64_16; + case FK_PCRel_1: + case FK_Data_1: + return RT64_8; + } +} + +static unsigned getRelocType64(MCSymbolRefExpr::VariantKind Modifier, + X86_64RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64; + case RT64_32: + return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32; + case RT64_32S: + return ELF::R_X86_64_32S; + case RT64_16: + return IsPCRel ? ELF::R_X86_64_PC16 : ELF::R_X86_64_16; + case RT64_8: + return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8; + } + case MCSymbolRefExpr::VK_GOT: + switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_GOTPC64 : ELF::R_X86_64_GOT64; + case RT64_32: + return IsPCRel ? ELF::R_X86_64_GOTPC32 : ELF::R_X86_64_GOT32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT64_64); + assert(!IsPCRel); + return ELF::R_X86_64_GOTOFF64; + case MCSymbolRefExpr::VK_TPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_TPOFF64; + case RT64_32: + return ELF::R_X86_64_TPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_DTPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_DTPOFF64; + case RT64_32: + return ELF::R_X86_64_DTPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_SIZE: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_SIZE64; + case RT64_32: + return ELF::R_X86_64_SIZE32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSGD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTTPOFF; + case MCSymbolRefExpr::VK_TLSLD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSLD; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT64_32); + return ELF::R_X86_64_PLT32; + case MCSymbolRefExpr::VK_GOTPCREL: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTPCREL; + } +} + +enum X86_32RelType { RT32_32, RT32_16, RT32_8 }; + +static X86_32RelType getType32(X86_64RelType T) { + switch (T) { + case RT64_64: + llvm_unreachable("Unimplemented"); + case RT64_32: + case RT64_32S: + return RT32_32; + case RT64_16: + return RT32_16; + case RT64_8: + return RT32_8; + } + 
llvm_unreachable("unexpected relocation type!"); +} + +static unsigned getRelocType32(MCSymbolRefExpr::VariantKind Modifier, + X86_32RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + switch (Type) { + case RT32_32: + return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32; + case RT32_16: + return IsPCRel ? ELF::R_386_PC16 : ELF::R_386_16; + case RT32_8: + return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8; + } + case MCSymbolRefExpr::VK_GOT: + assert(Type == RT32_32); + return IsPCRel ? ELF::R_386_GOTPC : ELF::R_386_GOT32; + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_GOTOFF; + case MCSymbolRefExpr::VK_TPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE_32; + case MCSymbolRefExpr::VK_DTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDO_32; + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE_32; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT32_32); + return ELF::R_386_PLT32; + case MCSymbolRefExpr::VK_INDNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE; + case MCSymbolRefExpr::VK_NTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE; + case MCSymbolRefExpr::VK_GOTNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GOTIE; + case MCSymbolRefExpr::VK_TLSLDM: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDM; + } +} + +unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); + X86_64RelType Type = getType64(Fixup.getKind(), Modifier, IsPCRel); + if (getEMachine() == ELF::EM_X86_64) + return getRelocType64(Modifier, Type, IsPCRel); + + assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) && + "Unsupported ELF machine type."); + return getRelocType32(Modifier, getType32(Type), IsPCRel); +} + +MCObjectWriter *llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS, + bool IsELF64, uint8_t OSABI, + uint16_t EMachine) { + MCELFObjectTargetWriter *MOTW = + new X86ELFObjectWriter(IsELF64, OSABI, EMachine); + return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp new file mode 100644 index 0000000..ddb764f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -0,0 +1,141 @@ +//===-- X86ELFRelocationInfo.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRelocationInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; +using namespace object; +using namespace ELF; + +namespace { +class X86_64ELFRelocationInfo : public MCRelocationInfo { +public: + X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} + + const MCExpr *createExprForRelocation(RelocationRef Rel) override { + uint64_t RelType = Rel.getType(); + elf_symbol_iterator SymI = Rel.getSymbol(); + + ErrorOr<StringRef> SymNameOrErr = SymI->getName(); + if (std::error_code EC = SymNameOrErr.getError()) + report_fatal_error(EC.message()); + StringRef SymName = *SymNameOrErr; + + ErrorOr<uint64_t> SymAddr = SymI->getAddress(); + if (std::error_code EC = SymAddr.getError()) + report_fatal_error(EC.message()); + uint64_t SymSize = SymI->getSize(); + int64_t Addend = *ELFRelocationRef(Rel).getAddend(); + + MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); + // FIXME: check that the value is actually the same. + if (!Sym->isVariable()) + Sym->setVariableValue(MCConstantExpr::create(*SymAddr, Ctx)); + + const MCExpr *Expr = nullptr; + // If hasAddend is true, then we need to add Addend (r_addend) to Expr. + bool hasAddend = false; + + // The AMD64 SysV ABI says: + // A: the addend used to compute the value of the relocatable field. + // B: the base address at which a shared object has been loaded into memory + // during execution. Generally, a shared object is built with a 0 base + // virtual address, but the execution address will be different. + // G: the offset into the global offset table at which the relocation + // entry's symbol will reside during execution. + // GOT: the address of the global offset table. + // L: the place (section offset or address) of the Procedure Linkage Table + // entry for a symbol. + // P: the place (section offset or address) of the storage unit being + // relocated (computed using r_offset). + // S: the value of the symbol whose index resides in the relocation entry. + // Z: the size of the symbol whose index resides in the relocation entry. + + switch(RelType) { + case R_X86_64_NONE: + case R_X86_64_COPY: + // none + break; + case R_X86_64_64: + case R_X86_64_16: + case R_X86_64_8: + // S + A + case R_X86_64_32: + case R_X86_64_32S: + // S + A (We don't care about the result not fitting in 32 bits.) 
+ case R_X86_64_PC32: + case R_X86_64_PC16: + case R_X86_64_PC8: + case R_X86_64_PC64: + // S + A - P (P/pcrel is implicit) + hasAddend = true; + Expr = MCSymbolRefExpr::create(Sym, Ctx); + break; + case R_X86_64_GOT32: + case R_X86_64_GOT64: + case R_X86_64_GOTPC32: + case R_X86_64_GOTPC64: + case R_X86_64_GOTPLT64: + // G + A + hasAddend = true; + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Ctx); + break; + case R_X86_64_PLT32: + // L + A - P -> S@PLT + A + hasAddend = true; + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_PLT, Ctx); + break; + case R_X86_64_GLOB_DAT: + case R_X86_64_JUMP_SLOT: + // S + Expr = MCSymbolRefExpr::create(Sym, Ctx); + break; + case R_X86_64_GOTPCREL: + case R_X86_64_GOTPCREL64: + // G + GOT + A - P -> S@GOTPCREL + A + hasAddend = true; + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + break; + case R_X86_64_GOTOFF64: + // S + A - GOT + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx); + break; + case R_X86_64_PLTOFF64: + // L + A - GOT + break; + case R_X86_64_SIZE32: + case R_X86_64_SIZE64: + // Z + A + Expr = MCConstantExpr::create(SymSize, Ctx); + break; + default: + Expr = MCSymbolRefExpr::create(Sym, Ctx); + break; + } + if (Expr && hasAddend && Addend != 0) + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(Addend, Ctx), + Ctx); + return Expr; + } +}; +} // End unnamed namespace + +/// createX86ELFRelocationInfo - Construct an X86 Mach-O RelocationInfo. +MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) { + // We only handle x86-64 for now. + return new X86_64ELFRelocationInfo(Ctx); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h new file mode 100644 index 0000000..4899900 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -0,0 +1,34 @@ +//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace X86 { +enum Fixups { + reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative + reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq + reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4 + // this will be sign extended at + // runtime. + reloc_global_offset_table, // 32-bit, relative to the start + // of the instruction. Used only + // for _GLOBAL_OFFSET_TABLE_. + reloc_global_offset_table8, // 64-bit variant. + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp new file mode 100644 index 0000000..fc0b0f8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -0,0 +1,172 @@ +//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the X86MCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "X86MCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +using namespace llvm; + +enum AsmWriterFlavorTy { + // Note: This numbering has to match the GCC assembler dialects for inline + // asm alternatives to work right. + ATT = 0, Intel = 1 +}; + +static cl::opt<AsmWriterFlavorTy> +AsmWriterFlavor("x86-asm-syntax", cl::init(ATT), + cl::desc("Choose style of code to emit from X86 backend:"), + cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"), + clEnumValN(Intel, "intel", "Emit Intel-style assembly"), + clEnumValEnd)); + +static cl::opt<bool> +MarkedJTDataRegions("mark-data-regions", cl::init(false), + cl::desc("Mark code section jump table data regions."), + cl::Hidden); + +void X86MCAsmInfoDarwin::anchor() { } + +X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { + bool is64Bit = T.getArch() == Triple::x86_64; + if (is64Bit) + PointerSize = CalleeSaveStackSlotSize = 8; + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + if (!is64Bit) + Data64bitsDirective = nullptr; // we can't emit a 64-bit unit + + // Use ## as a comment string so that .s files generated by llvm can go + // through the GCC preprocessor without causing an error. This is needed + // because "clang foo.s" runs the C preprocessor, which is usually reserved + // for .S files on other systems. Perhaps this is because the file system + // wasn't always case preserving or something. + CommentString = "##"; + + SupportsDebugInformation = true; + UseDataRegionDirectives = MarkedJTDataRegions; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + // old assembler lacks some directives + // FIXME: this should really be a check on the assembler characteristics + // rather than OS version + if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6)) + HasWeakDefCanBeHiddenDirective = false; + + // Assume ld64 is new enough that the abs-ified FDE relocs may be used + // (actually, must, since otherwise the non-extern relocations we produce + // overwhelm ld64's tiny little mind and it fails). + DwarfFDESymbolsUseAbsDiff = true; + + UseIntegratedAssembler = true; +} + +X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) + : X86MCAsmInfoDarwin(Triple) { +} + +void X86ELFMCAsmInfo::anchor() { } + +X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { + bool is64Bit = T.getArch() == Triple::x86_64; + bool isX32 = T.getEnvironment() == Triple::GNUX32; + + // For ELF, x86-64 pointer size depends on the ABI. + // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64 + // with the x32 ABI, pointer size remains the default 4. + PointerSize = (is64Bit && !isX32) ? 8 : 4; + + // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI. + CalleeSaveStackSlotSize = is64Bit ? 8 : 4; + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + // Debug Information + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + // Always enable the integrated assembler by default. 
+ // Clang also enabled it when the OS is Solaris but that is redundant here. + UseIntegratedAssembler = true; +} + +const MCExpr * +X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context); + const MCExpr *Four = MCConstantExpr::create(4, Context); + return MCBinaryExpr::createAdd(Res, Four, Context); +} + +void X86MCAsmInfoMicrosoft::anchor() { } + +X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { + if (Triple.getArch() == Triple::x86_64) { + PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; + PointerSize = 8; + WinEHEncodingType = WinEH::EncodingType::Itanium; + } else { + // 32-bit X86 doesn't use CFI, so this isn't a real encoding type. It's just + // a place holder that the Windows EHStreamer looks for to suppress CFI + // output. In particular, usesWindowsCFI() returns false. + WinEHEncodingType = WinEH::EncodingType::X86; + } + + ExceptionsType = ExceptionHandling::WinEH; + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + AllowAtInName = true; + + UseIntegratedAssembler = true; +} + +void X86MCAsmInfoGNUCOFF::anchor() { } + +X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { + assert(Triple.isOSWindows() && "Windows is the only supported COFF target"); + if (Triple.getArch() == Triple::x86_64) { + PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; + PointerSize = 8; + WinEHEncodingType = WinEH::EncodingType::Itanium; + ExceptionsType = ExceptionHandling::WinEH; + } else { + ExceptionsType = ExceptionHandling::DwarfCFI; + } + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + UseIntegratedAssembler = true; +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h new file mode 100644 index 0000000..30d5c80 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -0,0 +1,61 @@ +//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the X86MCAsmInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H + +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoCOFF.h" +#include "llvm/MC/MCAsmInfoDarwin.h" +#include "llvm/MC/MCAsmInfoELF.h" + +namespace llvm { +class Triple; + +class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit X86MCAsmInfoDarwin(const Triple &Triple); +}; + +struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; + +class X86ELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit X86ELFMCAsmInfo(const Triple &Triple); +}; + +class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); +}; + +class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); +}; +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp new file mode 100644 index 0000000..dfab6ec --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -0,0 +1,1506 @@ +//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86MCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +namespace { +class X86MCCodeEmitter : public MCCodeEmitter { + X86MCCodeEmitter(const X86MCCodeEmitter &) = delete; + void operator=(const X86MCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + MCContext &Ctx; +public: + X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), Ctx(ctx) { + } + + ~X86MCCodeEmitter() override {} + + bool is64BitMode(const MCSubtargetInfo &STI) const { + return STI.getFeatureBits()[X86::Mode64Bit]; + } + + bool is32BitMode(const MCSubtargetInfo &STI) const { + return STI.getFeatureBits()[X86::Mode32Bit]; + } + + bool is16BitMode(const MCSubtargetInfo &STI) const { + return STI.getFeatureBits()[X86::Mode16Bit]; + } + + /// Is16BitMemOperand - Return true if the specified instruction has + /// a 16-bit memory operand. Op specifies the operand # of the memoperand. 
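+  /// For example, an operand such as (%bx,%si), or a bare disp16 with no
+  /// base register when assembling in 16-bit mode.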
+ bool Is16BitMemOperand(const MCInst &MI, unsigned Op, + const MCSubtargetInfo &STI) const { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp); + + if (is16BitMode(STI) && BaseReg.getReg() == 0 && + Disp.isImm() && Disp.getImm() < 0x10000) + return true; + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + return true; + return false; + } + + unsigned GetX86RegNum(const MCOperand &MO) const { + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; + } + + // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range + // 0-7 and the difference between the 2 groups is given by the REX prefix. + // In the VEX prefix, registers are seen sequencially from 0-15 and encoded + // in 1's complement form, example: + // + // ModRM field => XMM9 => 1 + // VEX.VVVV => XMM9 => ~9 + // + // See table 4-35 of Intel AVX Programming Reference for details. + unsigned char getVEXRegisterEncoding(const MCInst &MI, + unsigned OpNum) const { + unsigned SrcReg = MI.getOperand(OpNum).getReg(); + unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum)); + if (X86II::isX86_64ExtendedReg(SrcReg)) + SrcRegNum |= 8; + + // The registers represented through VEX_VVVV should + // be encoded in 1's complement form. + return (~SrcRegNum) & 0xf; + } + + unsigned char getWriteMaskRegisterEncoding(const MCInst &MI, + unsigned OpNum) const { + assert(X86::K0 != MI.getOperand(OpNum).getReg() && + "Invalid mask register as write-mask!"); + unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum)); + return MaskRegNum; + } + + void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const { + OS << (char)C; + ++CurByte; + } + + void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte, + raw_ostream &OS) const { + // Output the constant in little endian byte order. + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, CurByte, OS); + Val >>= 8; + } + } + + void EmitImmediate(const MCOperand &Disp, SMLoc Loc, + unsigned ImmSize, MCFixupKind FixupKind, + unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + int ImmOffset = 0) const; + + inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode, + unsigned RM) { + assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); + return RM | (RegOpcode << 3) | (Mod << 6); + } + + void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld, + unsigned &CurByte, raw_ostream &OS) const { + EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS); + } + + void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base, + unsigned &CurByte, raw_ostream &OS) const { + // SIB byte is in the same format as the ModRMByte. 
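+    // For example, the operand [rax + rcx*4] yields SS=2 (scale 4),
+    // Index=1 (rcx) and Base=0 (rax), so the SIB byte is
+    // (2 << 6) | (1 << 3) | 0 == 0x88.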
+ EmitByte(ModRMByte(SS, Index, Base), CurByte, OS); + } + + + void EmitMemModRMByte(const MCInst &MI, unsigned Op, + unsigned RegOpcodeField, + uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, + const MCInst &MI, const MCInstrDesc &Desc, + raw_ostream &OS) const; + + void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand, + const MCInst &MI, raw_ostream &OS) const; + + void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, + const MCInst &MI, const MCInstrDesc &Desc, + const MCSubtargetInfo &STI, + raw_ostream &OS) const; +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new X86MCCodeEmitter(MCII, Ctx); +} + +/// isDisp8 - Return true if this signed displacement fits in a 8-bit +/// sign-extended field. +static bool isDisp8(int Value) { + return Value == (signed char)Value; +} + +/// isCDisp8 - Return true if this signed displacement fits in a 8-bit +/// compressed dispacement field. +static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { + assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) && + "Compressed 8-bit displacement is only valid for EVEX inst."); + + unsigned CD8_Scale = + (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift; + if (CD8_Scale == 0) { + CValue = Value; + return isDisp8(Value); + } + + unsigned Mask = CD8_Scale - 1; + assert((CD8_Scale & Mask) == 0 && "Invalid memory object size."); + if (Value & Mask) // Unaligned offset + return false; + Value /= (int)CD8_Scale; + bool Ret = (Value == (signed char)Value); + + if (Ret) + CValue = Value; + return Ret; +} + +/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate +/// in an instruction with the specified TSFlags. +static MCFixupKind getImmFixupKind(uint64_t TSFlags) { + unsigned Size = X86II::getSizeOfImm(TSFlags); + bool isPCRel = X86II::isImmPCRel(TSFlags); + + if (X86II::isImmSigned(TSFlags)) { + switch (Size) { + default: llvm_unreachable("Unsupported signed fixup size!"); + case 4: return MCFixupKind(X86::reloc_signed_4byte); + } + } + return MCFixup::getKindForSize(Size, isPCRel); +} + +/// Is32BitMemOperand - Return true if the specified instruction has +/// a 32-bit memory operand. Op specifies the operand # of the memoperand. +static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} + +/// Is64BitMemOperand - Return true if the specified instruction has +/// a 64-bit memory operand. Op specifies the operand # of the memoperand. 
+#ifndef NDEBUG +static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} +#endif + +/// StartsWithGlobalOffsetTable - Check if this expression starts with +/// _GLOBAL_OFFSET_TABLE_ and if it is of the form +/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF +/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that +/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start +/// of a binary expression. +enum GlobalOffsetTableExprKind { + GOT_None, + GOT_Normal, + GOT_SymDiff +}; +static GlobalOffsetTableExprKind +StartsWithGlobalOffsetTable(const MCExpr *Expr) { + const MCExpr *RHS = nullptr; + if (Expr->getKind() == MCExpr::Binary) { + const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr); + Expr = BE->getLHS(); + RHS = BE->getRHS(); + } + + if (Expr->getKind() != MCExpr::SymbolRef) + return GOT_None; + + const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); + const MCSymbol &S = Ref->getSymbol(); + if (S.getName() != "_GLOBAL_OFFSET_TABLE_") + return GOT_None; + if (RHS && RHS->getKind() == MCExpr::SymbolRef) + return GOT_SymDiff; + return GOT_Normal; +} + +static bool HasSecRelSymbolRef(const MCExpr *Expr) { + if (Expr->getKind() == MCExpr::SymbolRef) { + const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); + return Ref->getKind() == MCSymbolRefExpr::VK_SECREL; + } + return false; +} + +void X86MCCodeEmitter:: +EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, + MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { + const MCExpr *Expr = nullptr; + if (DispOp.isImm()) { + // If this is a simple integer displacement that doesn't require a + // relocation, emit it now. + if (FixupKind != FK_PCRel_1 && + FixupKind != FK_PCRel_2 && + FixupKind != FK_PCRel_4) { + EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS); + return; + } + Expr = MCConstantExpr::create(DispOp.getImm(), Ctx); + } else { + Expr = DispOp.getExpr(); + } + + // If we have an immoffset, add it to the expression. + if ((FixupKind == FK_Data_4 || + FixupKind == FK_Data_8 || + FixupKind == MCFixupKind(X86::reloc_signed_4byte))) { + GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr); + if (Kind != GOT_None) { + assert(ImmOffset == 0); + + if (Size == 8) { + FixupKind = MCFixupKind(X86::reloc_global_offset_table8); + } else { + assert(Size == 4); + FixupKind = MCFixupKind(X86::reloc_global_offset_table); + } + + if (Kind == GOT_Normal) + ImmOffset = CurByte; + } else if (Expr->getKind() == MCExpr::SymbolRef) { + if (HasSecRelSymbolRef(Expr)) { + FixupKind = MCFixupKind(FK_SecRel_4); + } + } else if (Expr->getKind() == MCExpr::Binary) { + const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr); + if (HasSecRelSymbolRef(Bin->getLHS()) + || HasSecRelSymbolRef(Bin->getRHS())) { + FixupKind = MCFixupKind(FK_SecRel_4); + } + } + } + + // If the fixup is pc-relative, we need to bias the value to be relative to + // the start of the field, not the end of the field. 
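+  // For a 4-byte pc-relative fixup, for instance, the CPU computes the
+  // displacement relative to the byte after the 4-byte field, while the
+  // fixup itself is evaluated at the start of the field, so we fold a -4
+  // bias into the offset below.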
+ if (FixupKind == FK_PCRel_4 || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load)) + ImmOffset -= 4; + if (FixupKind == FK_PCRel_2) + ImmOffset -= 2; + if (FixupKind == FK_PCRel_1) + ImmOffset -= 1; + + if (ImmOffset) + Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(ImmOffset, Ctx), + Ctx); + + // Emit a symbolic constant as a fixup and 4 zeros. + Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc)); + EmitConstant(0, Size, CurByte, OS); +} + +void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, + unsigned RegOpcodeField, + uint64_t TSFlags, unsigned &CurByte, + raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const{ + const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp); + const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + unsigned BaseReg = Base.getReg(); + bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; + + // Handle %rip relative addressing. + if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode + assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode"); + assert(IndexReg.getReg() == 0 && "Invalid rip-relative address"); + EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); + + unsigned FixupKind = X86::reloc_riprel_4byte; + + // movq loads are handled with a special relocation form which allows the + // linker to eliminate some loads for GOT references which end up in the + // same linkage unit. + if (MI.getOpcode() == X86::MOV64rm) + FixupKind = X86::reloc_riprel_4byte_movq_load; + + // rip-relative addressing is actually relative to the *next* instruction. + // Since an immediate can follow the mod/rm byte for an instruction, this + // means that we need to bias the immediate field of the instruction with + // the size of the immediate field. If we have this case, add it into the + // expression to emit. + int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0; + + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), + CurByte, OS, Fixups, -ImmSize); + return; + } + + unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U; + + // 16-bit addressing forms of the ModR/M byte have a different encoding for + // the R/M field and are far more limited in which registers can be used. + if (Is16BitMemOperand(MI, Op, STI)) { + if (BaseReg) { + // For 32-bit addressing, the row and column values in Table 2-2 are + // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with + // some special cases. And GetX86RegNum reflects that numbering. + // For 16-bit addressing it's more fun, as shown in the SDM Vol 2A, + // Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only + // use SI/DI/BP/BX, which have "row" values 4-7 in no particular order, + // while values 0-3 indicate the allowed combinations (base+index) of + // those: 0 for BX+SI, 1 for BX+DI, 2 for BP+SI, 3 for BP+DI. + // + // R16Table[] is a lookup from the normal RegNo, to the row values from + // Table 2-1 for 16-bit addressing modes. Where zero means disallowed. 
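+      // For example, BX (RegNo 3) maps to row 7 and SI (RegNo 6) maps to
+      // row 4, while AX/CX/DX/SP map to 0 because they cannot appear in a
+      // 16-bit base/index combination.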
+ static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 }; + unsigned RMfield = R16Table[BaseRegNo]; + + assert(RMfield && "invalid 16-bit base register"); + + if (IndexReg.getReg()) { + unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)]; + + assert(IndexReg16 && "invalid 16-bit index register"); + // We must have one of SI/DI (4,5), and one of BP/BX (6,7). + assert(((IndexReg16 ^ RMfield) & 2) && + "invalid 16-bit base/index register combination"); + assert(Scale.getImm() == 1 && + "invalid scale for 16-bit memory reference"); + + // Allow base/index to appear in either order (although GAS doesn't). + if (IndexReg16 & 2) + RMfield = (RMfield & 1) | ((7 - IndexReg16) << 1); + else + RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1); + } + + if (Disp.isImm() && isDisp8(Disp.getImm())) { + if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) { + // There is no displacement; just the register. + EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS); + return; + } + // Use the [REG]+disp8 form, including for [BP] which cannot be encoded. + EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + return; + } + // This is the [REG]+disp16 case. + EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS); + } else { + // There is no BaseReg; this is the plain [disp16] case. + EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS); + } + + // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases. + EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups); + return; + } + + // Determine whether a SIB byte is needed. + // If no BaseReg, issue a RIP relative instruction only if the MCE can + // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table + // 2-7) and absolute references. + + if (// The SIB byte must be used if there is an index register. + IndexReg.getReg() == 0 && + // The SIB byte must be used if the base is ESP/RSP/R12, all of which + // encode to an R/M value of 4, which indicates that a SIB byte is + // present. + BaseRegNo != N86::ESP && + // If there is no base register and we're in 64-bit mode, we need a SIB + // byte to emit an addr that is just 'disp32' (the non-RIP relative form). + (!is64BitMode(STI) || BaseReg != 0)) { + + if (BaseReg == 0) { // [disp32] in X86-32 mode + EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups); + return; + } + + // If the base is not EBP/ESP and there is no displacement, use simple + // indirect register encoding, this handles addresses like [EAX]. The + // encoding for [EBP] with no displacement means [disp32] so we handle it + // by emitting a displacement of 0 below. + if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) { + EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + return; + } + + // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. + if (Disp.isImm()) { + if (!HasEVEX && isDisp8(Disp.getImm())) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + return; + } + // Try EVEX compressed 8-bit displacement first; if failed, fall back to + // 32-bit displacement. 
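+      // For instance, with CD8_Scale == 64 a displacement of 640 is stored
+      // as the single byte 10, whereas an unaligned displacement such as 16
+      // cannot be compressed and is emitted as a full disp32.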
+ int CDisp8 = 0; + if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + CDisp8 - Disp.getImm()); + return; + } + } + + // Otherwise, emit the most general non-SIB encoding: [REG+disp32] + EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + CurByte, OS, Fixups); + return; + } + + // We need a SIB byte, so start by outputting the ModR/M byte first + assert(IndexReg.getReg() != X86::ESP && + IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); + + bool ForceDisp32 = false; + bool ForceDisp8 = false; + int CDisp8 = 0; + int ImmOffset = 0; + if (BaseReg == 0) { + // If there is no base register, we emit the special case SIB byte with + // MOD=0, BASE=5, to JUST get the index, scale, and displacement. + EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + ForceDisp32 = true; + } else if (!Disp.isImm()) { + // Emit the normal disp32 encoding. + EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + ForceDisp32 = true; + } else if (Disp.getImm() == 0 && + // Base reg can't be anything that ends up with '5' as the base + // reg, it is the magic [*] nomenclature that indicates no base. + BaseRegNo != N86::EBP) { + // Emit no displacement ModR/M byte + EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + } else if (!HasEVEX && isDisp8(Disp.getImm())) { + // Emit the disp8 encoding. + EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + // Emit the disp8 encoding. + EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + ImmOffset = CDisp8 - Disp.getImm(); + } else { + // Emit the normal disp32 encoding. + EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + } + + // Calculate what the SS field value should be... + static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 }; + unsigned SS = SSTable[Scale.getImm()]; + + if (BaseReg == 0) { + // Handle the SIB byte for the case where there is no base, see Intel + // Manual 2A, table 2-7. The displacement has already been output. + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = GetX86RegNum(IndexReg); + else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) + IndexRegNo = 4; + EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS); + } else { + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = GetX86RegNum(IndexReg); + else + IndexRegNo = 4; // For example [ESP+1*<noreg>+4] + EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS); + } + + // Do we need to output a displacement? + if (ForceDisp8) + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset); + else if (ForceDisp32 || Disp.getImm() != 0) + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + CurByte, OS, Fixups); +} + +/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix +/// called VEX. 
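+/// For example, vaddps %ymm2, %ymm1, %ymm0 is encoded as C5 F4 58 C2, where
+/// C5 F4 is the two-byte VEX prefix: R=1, vvvv=~1 (%ymm1), L=1 (256-bit
+/// vector) and pp=00.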
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + const MCInstrDesc &Desc, + raw_ostream &OS) const { + assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX."); + + uint64_t Encoding = TSFlags & X86II::EncodingMask; + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + bool HasEVEX_RC = TSFlags & X86II::EVEX_RC; + + // VEX_R: opcode externsion equivalent to REX.R in + // 1's complement (inverted) form + // + // 1: Same as REX_R=0 (must be 1 in 32-bit mode) + // 0: Same as REX_R=1 (64 bit mode only) + // + unsigned char VEX_R = 0x1; + unsigned char EVEX_R2 = 0x1; + + // VEX_X: equivalent to REX.X, only used when a + // register is used for index in SIB Byte. + // + // 1: Same as REX.X=0 (must be 1 in 32-bit mode) + // 0: Same as REX.X=1 (64-bit mode only) + unsigned char VEX_X = 0x1; + + // VEX_B: + // + // 1: Same as REX_B=0 (ignored in 32-bit mode) + // 0: Same as REX_B=1 (64 bit mode only) + // + unsigned char VEX_B = 0x1; + + // VEX_W: opcode specific (use like REX.W, or used for + // opcode extension, or ignored, depending on the opcode byte) + unsigned char VEX_W = 0; + + // VEX_5M (VEX m-mmmmm field): + // + // 0b00000: Reserved for future use + // 0b00001: implied 0F leading opcode + // 0b00010: implied 0F 38 leading opcode bytes + // 0b00011: implied 0F 3A leading opcode bytes + // 0b00100-0b11111: Reserved for future use + // 0b01000: XOP map select - 08h instructions with imm byte + // 0b01001: XOP map select - 09h instructions with no imm byte + // 0b01010: XOP map select - 0Ah instructions with imm dword + unsigned char VEX_5M = 0; + + // VEX_4V (VEX vvvv field): a register specifier + // (in 1's complement form) or 1111 if unused. 
+ unsigned char VEX_4V = 0xf; + unsigned char EVEX_V2 = 0x1; + + // VEX_L (Vector Length): + // + // 0: scalar or 128-bit vector + // 1: 256-bit vector + // + unsigned char VEX_L = 0; + unsigned char EVEX_L2 = 0; + + // VEX_PP: opcode extension providing equivalent + // functionality of a SIMD prefix + // + // 0b00: None + // 0b01: 66 + // 0b10: F3 + // 0b11: F2 + // + unsigned char VEX_PP = 0; + + // EVEX_U + unsigned char EVEX_U = 1; // Always '1' so far + + // EVEX_z + unsigned char EVEX_z = 0; + + // EVEX_b + unsigned char EVEX_b = 0; + + // EVEX_rc + unsigned char EVEX_rc = 0; + + // EVEX_aaa + unsigned char EVEX_aaa = 0; + + bool EncodeRC = false; + + if (TSFlags & X86II::VEX_W) + VEX_W = 1; + + if (TSFlags & X86II::VEX_L) + VEX_L = 1; + if (TSFlags & X86II::EVEX_L2) + EVEX_L2 = 1; + + if (HasEVEX_K && (TSFlags & X86II::EVEX_Z)) + EVEX_z = 1; + + if ((TSFlags & X86II::EVEX_B)) + EVEX_b = 1; + + switch (TSFlags & X86II::OpPrefixMask) { + default: break; // VEX_PP already correct + case X86II::PD: VEX_PP = 0x1; break; // 66 + case X86II::XS: VEX_PP = 0x2; break; // F3 + case X86II::XD: VEX_PP = 0x3; break; // F2 + } + + switch (TSFlags & X86II::OpMapMask) { + default: llvm_unreachable("Invalid prefix!"); + case X86II::TB: VEX_5M = 0x1; break; // 0F + case X86II::T8: VEX_5M = 0x2; break; // 0F 38 + case X86II::TA: VEX_5M = 0x3; break; // 0F 3A + case X86II::XOP8: VEX_5M = 0x8; break; + case X86II::XOP9: VEX_5M = 0x9; break; + case X86II::XOPA: VEX_5M = 0xA; break; + } + + // Classify VEX_B, VEX_4V, VEX_R, VEX_X + unsigned NumOps = Desc.getNumOperands(); + unsigned CurOp = X86II::getOperandBias(Desc); + + switch (TSFlags & X86II::FormMask) { + default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!"); + case X86II::RawFrm: + break; + case X86II::MRMDestMem: { + // MRMDestMem instructions forms: + // MemAddr, src1(ModR/M) + // MemAddr, src1(VEX_4V), src2(ModR/M) + // MemAddr, src1(ModR/M), imm8 + // + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; + + CurOp += X86::AddrNumOperands; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + const MCOperand &MO = MI.getOperand(CurOp); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + VEX_R = 0x0; + if (X86II::is32ExtendedReg(MO.getReg())) + EVEX_R2 = 0x0; + } + break; + } + case X86II::MRMSrcMem: + // MRMSrcMem instructions forms: + // src1(ModR/M), MemAddr + // src1(ModR/M), src2(VEX_4V), MemAddr + // src1(ModR/M), MemAddr, imm8 + // src1(ModR/M), MemAddr, src2(VEX_I8IMM) + // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + if (X86II::isX86_64ExtendedReg( + 
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; + + if (HasVEX_4VOp3) + // Instruction format for 4VOp3: + // src1(ModR/M), MemAddr, src3(VEX_4V) + // CurOp points to start of the MemoryOperand, + // it skips TIED_TO operands if exist, then increments past src1. + // CurOp + X86::AddrNumOperands will point to src3. + VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands); + break; + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + // MRM[0-9]m instructions forms: + // MemAddr + // src1(VEX_4V), MemAddr + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + break; + } + case X86II::MRMSrcReg: + // MRMSrcReg instructions forms: + // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M), src1(ModR/M) + // dst(ModR/M), src1(ModR/M), imm8 + // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + if (HasMemOp4) // Skip second register source (encoded in I8IMM) + CurOp++; + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; + CurOp++; + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (EVEX_b) { + if (HasEVEX_RC) { + unsigned RcOperand = NumOps-1; + assert(RcOperand >= CurOp); + EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3; + } + EncodeRC = true; + } + break; + case X86II::MRMDestReg: + // MRMDestReg instructions forms: + // dst(ModR/M), src(ModR/M) + // dst(ModR/M), src(ModR/M), imm8 + // dst(ModR/M), src1(VEX_4V), src2(ModR/M) + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; + if (EVEX_b) + EncodeRC = true; + break; + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + // MRM0r-MRM7r instructions forms: + // 
dst(VEX_4V), src(ModR/M), imm8 + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; + break; + } + + if (Encoding == X86II::VEX || Encoding == X86II::XOP) { + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + // XOP uses a similar prefix: + // +-----+ +--------------+ +-------------------+ + // | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + + // Can we use the 2 byte VEX prefix? + if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { + EmitByte(0xC5, CurByte, OS); + EmitByte(LastByte | (VEX_R << 7), CurByte, OS); + return; + } + + // 3 byte VEX prefix + EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS); + EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); + EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + } else { + assert(Encoding == X86II::EVEX && "unknown encoding!"); + // EVEX opcode prefix can have 4 bytes + // + // +-----+ +--------------+ +-------------------+ +------------------------+ + // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | + // +-----+ +--------------+ +-------------------+ +------------------------+ + assert((VEX_5M & 0x3) == VEX_5M + && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); + + VEX_5M &= 0x3; + + EmitByte(0x62, CurByte, OS); + EmitByte((VEX_R << 7) | + (VEX_X << 6) | + (VEX_B << 5) | + (EVEX_R2 << 4) | + VEX_5M, CurByte, OS); + EmitByte((VEX_W << 7) | + (VEX_4V << 3) | + (EVEX_U << 2) | + VEX_PP, CurByte, OS); + if (EncodeRC) + EmitByte((EVEX_z << 7) | + (EVEX_rc << 5) | + (EVEX_b << 4) | + (EVEX_V2 << 3) | + EVEX_aaa, CurByte, OS); + else + EmitByte((EVEX_z << 7) | + (EVEX_L2 << 6) | + (VEX_L << 5) | + (EVEX_b << 4) | + (EVEX_V2 << 3) | + EVEX_aaa, CurByte, OS); + } +} + +/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 +/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand +/// size, and 3) use of X86-64 extended registers. +static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, + const MCInstrDesc &Desc) { + unsigned REX = 0; + bool UsesHighByteReg = false; + + if (TSFlags & X86II::REX_W) + REX |= 1 << 3; // set REX.W + + if (MI.getNumOperands() == 0) return REX; + + unsigned NumOps = MI.getNumOperands(); + // FIXME: MCInst should explicitize the two-addrness. + bool isTwoAddr = NumOps > 1 && + Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + unsigned i = isTwoAddr ? 
1 : 0; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) + UsesHighByteReg = true; + if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue; + // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything + // that returns non-zero. + REX |= 0x40; // REX fixed encoding prefix + break; + } + + switch (TSFlags & X86II::FormMask) { + case X86II::MRMSrcReg: + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 2; // set REX.R + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << 0; // set REX.B + } + break; + case X86II::MRMSrcMem: { + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 2; // set REX.R + unsigned Bit = 0; + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1) + Bit++; + } + } + break; + } + case X86II::MRMXm: + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRMDestMem: { + unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); + i = isTwoAddr ? 1 : 0; + if (NumOps > e && MI.getOperand(e).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg())) + REX |= 1 << 2; // set REX.R + unsigned Bit = 0; + for (; i != e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1) + Bit++; + } + } + break; + } + default: + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 0; // set REX.B + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << 2; // set REX.R + } + break; + } + if (REX && UsesHighByteReg) + report_fatal_error("Cannot encode high byte register in REX-prefixed instruction"); + + return REX; +} + +/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed +void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte, + unsigned SegOperand, + const MCInst &MI, + raw_ostream &OS) const { + // Check for explicit segment override on memory operand. + switch (MI.getOperand(SegOperand).getReg()) { + default: llvm_unreachable("Unknown segment register!"); + case 0: break; + case X86::CS: EmitByte(0x2E, CurByte, OS); break; + case X86::SS: EmitByte(0x36, CurByte, OS); break; + case X86::DS: EmitByte(0x3E, CurByte, OS); break; + case X86::ES: EmitByte(0x26, CurByte, OS); break; + case X86::FS: EmitByte(0x64, CurByte, OS); break; + case X86::GS: EmitByte(0x65, CurByte, OS); break; + } +} + +/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode. +/// +/// MemOperand is the operand # of the start of a memory operand if present. If +/// Not present, it is -1. 
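+/// This function emits, in order: the operand-size prefix (66) when needed,
+/// LOCK (F0), the mandatory 66/F3/F2 prefix, the REX prefix in 64-bit mode,
+/// and finally the 0F / 0F 38 / 0F 3A escape bytes.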
+void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + const MCInstrDesc &Desc, + const MCSubtargetInfo &STI, + raw_ostream &OS) const { + + // Emit the operand size opcode prefix as needed. + if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32 + : X86II::OpSize16)) + EmitByte(0x66, CurByte, OS); + + // Emit the LOCK opcode prefix. + if (TSFlags & X86II::LOCK) + EmitByte(0xF0, CurByte, OS); + + switch (TSFlags & X86II::OpPrefixMask) { + case X86II::PD: // 66 + EmitByte(0x66, CurByte, OS); + break; + case X86II::XS: // F3 + EmitByte(0xF3, CurByte, OS); + break; + case X86II::XD: // F2 + EmitByte(0xF2, CurByte, OS); + break; + } + + // Handle REX prefix. + // FIXME: Can this come before F2 etc to simplify emission? + if (is64BitMode(STI)) { + if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc)) + EmitByte(0x40 | REX, CurByte, OS); + } + + // 0x0F escape code must be emitted just before the opcode. + switch (TSFlags & X86II::OpMapMask) { + case X86II::TB: // Two-byte opcode map + case X86II::T8: // 0F 38 + case X86II::TA: // 0F 3A + EmitByte(0x0F, CurByte, OS); + break; + } + + switch (TSFlags & X86II::OpMapMask) { + case X86II::T8: // 0F 38 + EmitByte(0x38, CurByte, OS); + break; + case X86II::TA: // 0F 3A + EmitByte(0x3A, CurByte, OS); + break; + } +} + +void X86MCCodeEmitter:: +encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MCII.get(Opcode); + uint64_t TSFlags = Desc.TSFlags; + + // Pseudo instructions don't get encoded. + if ((TSFlags & X86II::FormMask) == X86II::Pseudo) + return; + + unsigned NumOps = Desc.getNumOperands(); + unsigned CurOp = X86II::getOperandBias(Desc); + + // Keep track of the current byte being emitted. + unsigned CurByte = 0; + + // Encoding type for this instruction. + uint64_t Encoding = TSFlags & X86II::EncodingMask; + + // It uses the VEX.VVVV field? + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + const unsigned MemOp4_I8IMMOperand = 2; + + // It uses the EVEX.aaa field? + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + bool HasEVEX_RC = TSFlags & X86II::EVEX_RC; + + // Determine where the memory operand starts, if present. + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); + if (MemoryOperand != -1) MemoryOperand += CurOp; + + // Emit segment override opcode prefix as needed. + if (MemoryOperand >= 0) + EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg, + MI, OS); + + // Emit the repeat opcode prefix as needed. + if (TSFlags & X86II::REP) + EmitByte(0xF3, CurByte, OS); + + // Emit the address size opcode prefix as needed. 
+ bool need_address_override; + uint64_t AdSize = TSFlags & X86II::AdSizeMask; + if ((is16BitMode(STI) && AdSize == X86II::AdSize32) || + (is32BitMode(STI) && AdSize == X86II::AdSize16) || + (is64BitMode(STI) && AdSize == X86II::AdSize32)) { + need_address_override = true; + } else if (MemoryOperand < 0) { + need_address_override = false; + } else if (is64BitMode(STI)) { + assert(!Is16BitMemOperand(MI, MemoryOperand, STI)); + need_address_override = Is32BitMemOperand(MI, MemoryOperand); + } else if (is32BitMode(STI)) { + assert(!Is64BitMemOperand(MI, MemoryOperand)); + need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI); + } else { + assert(is16BitMode(STI)); + assert(!Is64BitMemOperand(MI, MemoryOperand)); + need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI); + } + + if (need_address_override) + EmitByte(0x67, CurByte, OS); + + if (Encoding == 0) + EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS); + else + EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); + + unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags); + + if (TSFlags & X86II::Has3DNow0F0FOpcode) + BaseOpcode = 0x0F; // Weird 3DNow! encoding. + + unsigned SrcRegNum = 0; + switch (TSFlags & X86II::FormMask) { + default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n"; + llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!"); + case X86II::Pseudo: + llvm_unreachable("Pseudo instruction shouldn't be emitted"); + case X86II::RawFrmDstSrc: { + unsigned siReg = MI.getOperand(1).getReg(); + assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) || + (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) || + (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) && + "SI and DI register sizes do not match"); + // Emit segment override opcode prefix as needed (not for %ds). + if (MI.getOperand(2).getReg() != X86::DS) + EmitSegmentOverridePrefix(CurByte, 2, MI, OS); + // Emit AdSize prefix as needed. + if ((!is32BitMode(STI) && siReg == X86::ESI) || + (is32BitMode(STI) && siReg == X86::SI)) + EmitByte(0x67, CurByte, OS); + CurOp += 3; // Consume operands. + EmitByte(BaseOpcode, CurByte, OS); + break; + } + case X86II::RawFrmSrc: { + unsigned siReg = MI.getOperand(0).getReg(); + // Emit segment override opcode prefix as needed (not for %ds). + if (MI.getOperand(1).getReg() != X86::DS) + EmitSegmentOverridePrefix(CurByte, 1, MI, OS); + // Emit AdSize prefix as needed. + if ((!is32BitMode(STI) && siReg == X86::ESI) || + (is32BitMode(STI) && siReg == X86::SI)) + EmitByte(0x67, CurByte, OS); + CurOp += 2; // Consume operands. + EmitByte(BaseOpcode, CurByte, OS); + break; + } + case X86II::RawFrmDst: { + unsigned siReg = MI.getOperand(0).getReg(); + // Emit AdSize prefix as needed. + if ((!is32BitMode(STI) && siReg == X86::EDI) || + (is32BitMode(STI) && siReg == X86::DI)) + EmitByte(0x67, CurByte, OS); + ++CurOp; // Consume operand. + EmitByte(BaseOpcode, CurByte, OS); + break; + } + case X86II::RawFrm: + EmitByte(BaseOpcode, CurByte, OS); + break; + case X86II::RawFrmMemOffs: + // Emit segment override opcode prefix as needed. 
+ EmitSegmentOverridePrefix(CurByte, 1, MI, OS); + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + ++CurOp; // skip segment operand + break; + case X86II::RawFrmImm8: + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte, + OS, Fixups); + break; + case X86II::RawFrmImm16: + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte, + OS, Fixups); + break; + + case X86II::AddRegFrm: + EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); + break; + + case X86II::MRMDestReg: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + 1; + + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + EmitRegModRMByte(MI.getOperand(CurOp), + GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS); + CurOp = SrcRegNum + 1; + break; + + case X86II::MRMDestMem: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + X86::AddrNumOperands; + + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + EmitMemModRMByte(MI, CurOp, + GetX86RegNum(MI.getOperand(SrcRegNum)), + TSFlags, CurByte, OS, Fixups, STI); + CurOp = SrcRegNum + 1; + break; + + case X86II::MRMSrcReg: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + 1; + + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) + ++SrcRegNum; + + EmitRegModRMByte(MI.getOperand(SrcRegNum), + GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + + // 2 operands skipped with HasMemOp4, compensate accordingly + CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; + if (HasVEX_4VOp3) + ++CurOp; + // do not count the rounding control operand + if (HasEVEX_RC) + NumOps--; + break; + + case X86II::MRMSrcMem: { + int AddrOperands = X86::AddrNumOperands; + unsigned FirstMemOp = CurOp+1; + + if (HasEVEX_K) { // Skip writemask + ++AddrOperands; + ++FirstMemOp; + } + + if (HasVEX_4V) { + ++AddrOperands; + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + } + if (HasMemOp4) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; + + EmitByte(BaseOpcode, CurByte, OS); + + EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), + TSFlags, CurByte, OS, Fixups, STI); + CurOp += AddrOperands + 1; + if (HasVEX_4VOp3) + ++CurOp; + break; + } + + case X86II::MRMXr: + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; + if (HasEVEX_K) // Skip writemask + ++CurOp; + EmitByte(BaseOpcode, CurByte, OS); + uint64_t Form = TSFlags & X86II::FormMask; + EmitRegModRMByte(MI.getOperand(CurOp++), + (Form == X86II::MRMXr) ? 
0 : Form-X86II::MRM0r, + CurByte, OS); + break; + } + + case X86II::MRMXm: + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; + if (HasEVEX_K) // Skip writemask + ++CurOp; + EmitByte(BaseOpcode, CurByte, OS); + uint64_t Form = TSFlags & X86II::FormMask; + EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m, + TSFlags, CurByte, OS, Fixups, STI); + CurOp += X86::AddrNumOperands; + break; + } + case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: + case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: + case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: + EmitByte(BaseOpcode, CurByte, OS); + + uint64_t Form = TSFlags & X86II::FormMask; + EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS); + break; + } + + // If there is a remaining operand, it must be a trailing immediate. Emit it + // according to the right size for the instruction. Some instructions + // (SSE4a extrq and insertq) have two trailing immediates. + while (CurOp != NumOps && NumOps - CurOp <= 2) { + // The last source register of a 4 operand instruction in AVX is encoded + // in bits[7:4] of a immediate byte. + if (TSFlags & X86II::VEX_I8IMM) { + const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand + : CurOp); + ++CurOp; + unsigned RegNum = GetX86RegNum(MO) << 4; + if (X86II::isX86_64ExtendedReg(MO.getReg())) + RegNum |= 1 << 7; + // If there is an additional 5th operand it must be an immediate, which + // is encoded in bits[3:0] + if (CurOp != NumOps) { + const MCOperand &MIMM = MI.getOperand(CurOp++); + if (MIMM.isImm()) { + unsigned Val = MIMM.getImm(); + assert(Val < 16 && "Immediate operand value out of range"); + RegNum |= Val; + } + } + EmitImmediate(MCOperand::createImm(RegNum), MI.getLoc(), 1, FK_Data_1, + CurByte, OS, Fixups); + } else { + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + } + } + + if (TSFlags & X86II::Has3DNow0F0FOpcode) + EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS); + +#ifndef NDEBUG + // FIXME: Verify. 
+ if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) { + errs() << "Cannot encode all operands of: "; + MI.dump(); + errs() << '\n'; + abort(); + } +#endif +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp new file mode 100644 index 0000000..53a6550 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -0,0 +1,451 @@ +//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides X86 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "X86MCTargetDesc.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "InstPrinter/X86IntelInstPrinter.h" +#include "X86MCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/TargetRegistry.h" + +#if _MSC_VER +#include <intrin.h> +#endif + +using namespace llvm; + +#define GET_REGINFO_MC_DESC +#include "X86GenRegisterInfo.inc" + +#define GET_INSTRINFO_MC_DESC +#include "X86GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "X86GenSubtargetInfo.inc" + +std::string X86_MC::ParseX86Triple(const Triple &TT) { + std::string FS; + if (TT.getArch() == Triple::x86_64) + FS = "+64bit-mode,-32bit-mode,-16bit-mode"; + else if (TT.getEnvironment() != Triple::CODE16) + FS = "-64bit-mode,+32bit-mode,-16bit-mode"; + else + FS = "-64bit-mode,-32bit-mode,+16bit-mode"; + + return FS; +} + +unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) { + if (TT.getArch() == Triple::x86_64) + return DWARFFlavour::X86_64; + + if (TT.isOSDarwin()) + return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic; + if (TT.isOSCygMing()) + // Unsupported by now, just quick fallback + return DWARFFlavour::X86_32_Generic; + return DWARFFlavour::X86_32_Generic; +} + +void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) { + // FIXME: TableGen these. + for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) { + unsigned SEH = MRI->getEncodingValue(Reg); + MRI->mapLLVMRegToSEHReg(Reg, SEH); + } +} + +MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, + StringRef CPU, StringRef FS) { + std::string ArchFS = X86_MC::ParseX86Triple(TT); + if (!FS.empty()) { + if (!ArchFS.empty()) + ArchFS = (Twine(ArchFS) + "," + FS).str(); + else + ArchFS = FS; + } + + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "generic"; + + return createX86MCSubtargetInfoImpl(TT, CPUName, ArchFS); +} + +static MCInstrInfo *createX86MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitX86MCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) { + unsigned RA = (TT.getArch() == Triple::x86_64) + ? X86::RIP // Should have dwarf #16. + : X86::EIP; // Should have dwarf #8. 
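+  // The "dwarf #" remarks above refer to the DWARF register numbering used
+  // for call-frame information: the x86-64 psABI assigns the return-address
+  // column number 16 and the i386 psABI assigns it number 8, which LLVM
+  // models with %rip and %eip respectively. The two getDwarfRegFlavour()
+  // calls below select the debug-info and EH register numbering tables.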
+ + MCRegisterInfo *X = new MCRegisterInfo(); + InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true), RA); + X86_MC::InitLLVM2SEHRegisterMapping(X); + return X; +} + +static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, + const Triple &TheTriple) { + bool is64Bit = TheTriple.getArch() == Triple::x86_64; + + MCAsmInfo *MAI; + if (TheTriple.isOSBinFormatMachO()) { + if (is64Bit) + MAI = new X86_64MCAsmInfoDarwin(TheTriple); + else + MAI = new X86MCAsmInfoDarwin(TheTriple); + } else if (TheTriple.isOSBinFormatELF()) { + // Force the use of an ELF container. + MAI = new X86ELFMCAsmInfo(TheTriple); + } else if (TheTriple.isWindowsMSVCEnvironment() || + TheTriple.isWindowsCoreCLREnvironment()) { + MAI = new X86MCAsmInfoMicrosoft(TheTriple); + } else if (TheTriple.isOSCygMing() || + TheTriple.isWindowsItaniumEnvironment()) { + MAI = new X86MCAsmInfoGNUCOFF(TheTriple); + } else { + // The default is ELF. + MAI = new X86ELFMCAsmInfo(TheTriple); + } + + // Initialize initial frame state. + // Calculate amount of bytes used for return address storing + int stackGrowth = is64Bit ? -8 : -4; + + // Initial state of the frame pointer is esp+stackGrowth. + unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP; + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa( + nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); + MAI->addInitialFrameState(Inst); + + // Add return address to move list + unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP; + MCCFIInstruction Inst2 = MCCFIInstruction::createOffset( + nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth); + MAI->addInitialFrameState(Inst2); + + return MAI; +} + +static MCCodeGenInfo *createX86MCCodeGenInfo(const Triple &TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + + bool is64Bit = TT.getArch() == Triple::x86_64; + + if (RM == Reloc::Default) { + // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode. + // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we + // use static relocation model by default. + if (TT.isOSDarwin()) { + if (is64Bit) + RM = Reloc::PIC_; + else + RM = Reloc::DynamicNoPIC; + } else if (TT.isOSWindows() && is64Bit) + RM = Reloc::PIC_; + else + RM = Reloc::Static; + } + + // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC + // is defined as a model for code which may be used in static or dynamic + // executables but not necessarily a shared library. On X86-32 we just + // compile in -static mode, in x86-64 we use PIC. + if (RM == Reloc::DynamicNoPIC) { + if (is64Bit) + RM = Reloc::PIC_; + else if (!TT.isOSDarwin()) + RM = Reloc::Static; + } + + // If we are on Darwin, disallow static relocation model in X86-64 mode, since + // the Mach-O file format doesn't support it. + if (RM == Reloc::Static && TT.isOSDarwin() && is64Bit) + RM = Reloc::PIC_; + + // For static codegen, if we're not already set, use Small codegen. + if (CM == CodeModel::Default) + CM = CodeModel::Small; + else if (CM == CodeModel::JITDefault) + // 64-bit JIT places everything in the same buffer except external funcs. + CM = is64Bit ? 
CodeModel::Large : CodeModel::Small; + + X->initMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createX86MCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + if (SyntaxVariant == 0) + return new X86ATTInstPrinter(MAI, MII, MRI); + if (SyntaxVariant == 1) + return new X86IntelInstPrinter(MAI, MII, MRI); + return nullptr; +} + +static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple, + MCContext &Ctx) { + if (TheTriple.isOSBinFormatMachO() && TheTriple.getArch() == Triple::x86_64) + return createX86_64MachORelocationInfo(Ctx); + else if (TheTriple.isOSBinFormatELF()) + return createX86_64ELFRelocationInfo(Ctx); + // Default to the stock relocation info. + return llvm::createMCRelocationInfo(TheTriple, Ctx); +} + +static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { + return new MCInstrAnalysis(Info); +} + +// Force static initialization. +extern "C" void LLVMInitializeX86TargetMC() { + for (Target *T : {&TheX86_32Target, &TheX86_64Target}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo); + + // Register the MC codegen info. + RegisterMCCodeGenInfoFn Y(*T, createX86MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createX86MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, + X86_MC::createX86MCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(*T, createX86MCInstrAnalysis); + + // Register the code emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter); + + // Register the object streamer. + TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createX86MCInstPrinter); + + // Register the MC relocation info. + TargetRegistry::RegisterMCRelocationInfo(*T, createX86MCRelocationInfo); + } + + // Register the asm backend. 
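+  // (Everything registered in the loop above, and the asm backends below, is
+  // only reached once a client resolves a target through TargetRegistry --
+  // typically via something like
+  // TargetRegistry::lookupTarget("x86_64-unknown-linux-gnu", Error) -- and
+  // then asks that Target for the individual MC components. The backends are
+  // registered outside the loop since the 32- and 64-bit factory functions
+  // differ.)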
+ TargetRegistry::RegisterMCAsmBackend(TheX86_32Target, + createX86_32AsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, + createX86_64AsmBackend); +} + +unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, + bool High) { + switch (Size) { + default: return 0; + case 8: + if (High) { + switch (Reg) { + default: return getX86SubSuperRegisterOrZero(Reg, 64); + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + } + } + case 16: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return 
X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case 32: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case 64: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } +} + +unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) { + unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High); + assert(Res != 0 && "Unexpected register or VT"); + return Res; +} + + diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h new file mode 
100644 index 0000000..2d2836f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -0,0 +1,129 @@ +//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides X86 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include <string> + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class MCRelocationInfo; +class MCStreamer; +class Target; +class Triple; +class StringRef; +class raw_ostream; +class raw_pwrite_stream; + +extern Target TheX86_32Target, TheX86_64Target; + +/// Flavour of dwarf regnumbers +/// +namespace DWARFFlavour { + enum { + X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2 + }; +} + +/// Native X86 register numbers +/// +namespace N86 { + enum { + EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7 + }; +} + +namespace X86_MC { +std::string ParseX86Triple(const Triple &TT); + +unsigned getDwarfRegFlavour(const Triple &TT, bool isEH); + +void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); + +/// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. +/// do not need to go through TargetRegistry. +MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, + StringRef FS); +} + +MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + +MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); +MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); + +/// Construct an X86 Windows COFF machine code streamer which will generate +/// PE/COFF format object files. +/// +/// Takes ownership of \p AB and \p CE. +MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + raw_pwrite_stream &OS, MCCodeEmitter *CE, + bool RelaxAll, bool IncrementalLinkerCompatible); + +/// Construct an X86 Mach-O object writer. +MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); + +/// Construct an X86 ELF object writer. +MCObjectWriter *createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64, + uint8_t OSABI, uint16_t EMachine); +/// Construct an X86 Win COFF object writer. +MCObjectWriter *createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit); + +/// Construct X86-64 Mach-O relocation info. +MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); + +/// Construct X86-64 ELF relocation info. +MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); + +/// Returns the sub or super register of a specific X86 register. +/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. +/// Aborts on error. +unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false); + +/// Returns the sub or super register of a specific X86 register. 
+/// Like getX86SubSuperRegister() but returns 0 on error. +unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, + bool High = false); + +} // End llvm namespace + + +// Defines symbolic names for X86 registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "X86GenRegisterInfo.inc" + +// Defines symbolic names for the X86 instructions. +// +#define GET_INSTRINFO_ENUM +#include "X86GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "X86GenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp new file mode 100644 index 0000000..9bfe999 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -0,0 +1,119 @@ +//===-- X86MachORelocationInfo.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRelocationInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/MachO.h" + +using namespace llvm; +using namespace object; +using namespace MachO; + +namespace { +class X86_64MachORelocationInfo : public MCRelocationInfo { +public: + X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} + + const MCExpr *createExprForRelocation(RelocationRef Rel) override { + const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObject()); + + uint64_t RelType = Rel.getType(); + symbol_iterator SymI = Rel.getSymbol(); + + ErrorOr<StringRef> SymNameOrErr = SymI->getName(); + if (std::error_code EC = SymNameOrErr.getError()) + report_fatal_error(EC.message()); + StringRef SymName = *SymNameOrErr; + uint64_t SymAddr = SymI->getValue(); + + any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl()); + bool isPCRel = Obj->getAnyRelocationPCRel(RE); + + MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); + // FIXME: check that the value is actually the same. + if (!Sym->isVariable()) + Sym->setVariableValue(MCConstantExpr::create(SymAddr, Ctx)); + const MCExpr *Expr = nullptr; + + switch(RelType) { + case X86_64_RELOC_TLV: + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx); + break; + case X86_64_RELOC_SIGNED_4: + Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx), + MCConstantExpr::create(4, Ctx), + Ctx); + break; + case X86_64_RELOC_SIGNED_2: + Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx), + MCConstantExpr::create(2, Ctx), + Ctx); + break; + case X86_64_RELOC_SIGNED_1: + Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx), + MCConstantExpr::create(1, Ctx), + Ctx); + break; + case X86_64_RELOC_GOT_LOAD: + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + break; + case X86_64_RELOC_GOT: + Expr = MCSymbolRefExpr::create(Sym, isPCRel ? + MCSymbolRefExpr::VK_GOTPCREL : + MCSymbolRefExpr::VK_GOT, + Ctx); + break; + case X86_64_RELOC_SUBTRACTOR: + { + Rel.moveNext(); + any_relocation_info RENext = + Obj->getRelocation(Rel.getRawDataRefImpl()); + + // X86_64_SUBTRACTOR must be followed by a relocation of type + // X86_64_RELOC_UNSIGNED. + // NOTE: Scattered relocations don't exist on x86_64. 
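+      // (Scattered entries are a 32-bit Mach-O mechanism, flagged by the
+      // R_SCATTERED high bit of r_word0; x86_64 object files only ever use
+      // plain relocation_info records, so a SUBTRACTOR/UNSIGNED pair is how
+      // a difference of two symbols is expressed here.)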
+ unsigned RType = Obj->getAnyRelocationType(RENext); + if (RType != X86_64_RELOC_UNSIGNED) + report_fatal_error("Expected X86_64_RELOC_UNSIGNED after " + "X86_64_RELOC_SUBTRACTOR."); + + const MCExpr *LHS = MCSymbolRefExpr::create(Sym, Ctx); + + symbol_iterator RSymI = Rel.getSymbol(); + uint64_t RSymAddr = RSymI->getValue(); + ErrorOr<StringRef> RSymName = RSymI->getName(); + if (std::error_code EC = RSymName.getError()) + report_fatal_error(EC.message()); + + MCSymbol *RSym = Ctx.getOrCreateSymbol(*RSymName); + if (!RSym->isVariable()) + RSym->setVariableValue(MCConstantExpr::create(RSymAddr, Ctx)); + + const MCExpr *RHS = MCSymbolRefExpr::create(RSym, Ctx); + + Expr = MCBinaryExpr::createSub(LHS, RHS, Ctx); + break; + } + default: + Expr = MCSymbolRefExpr::create(Sym, Ctx); + break; + } + return Expr; + } +}; +} // End unnamed namespace + +/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo. +MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) { + return new X86_64MachORelocationInfo(Ctx); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp new file mode 100644 index 0000000..191ebea --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -0,0 +1,605 @@ +//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MachO.h" + +using namespace llvm; + +namespace { +class X86MachObjectWriter : public MCMachObjectTargetWriter { + bool recordScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Log2Size, + uint64_t &FixedValue); + void recordTLVPRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + + void RecordX86Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue); + +public: + X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {} + + void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override { + if (Writer->is64Bit()) + RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + 
else + RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } +}; +} + +static bool isFixupKindRIPRel(unsigned Kind) { + return Kind == X86::reloc_riprel_4byte || + Kind == X86::reloc_riprel_4byte_movq_load; +} + +static unsigned getFixupKindLog2Size(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("invalid fixup kind!"); + case FK_PCRel_1: + case FK_Data_1: return 0; + case FK_PCRel_2: + case FK_Data_2: return 1; + case FK_PCRel_4: + // FIXME: Remove these!!! + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_signed_4byte: + case FK_Data_4: return 2; + case FK_Data_8: return 3; + } +} + +void X86MachObjectWriter::RecordX86_64Relocation( + MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind()); + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + + // See <reloc.h>. + uint32_t FixupOffset = + Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); + uint32_t FixupAddress = + Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); + int64_t Value = 0; + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = 0; + const MCSymbol *RelSymbol = nullptr; + + Value = Target.getConstant(); + + if (IsPCRel) { + // Compensate for the relocation offset, Darwin x86_64 relocations only have + // the addend and appear to have attempted to define it to be the actual + // expression addend without the PCrel bias. However, instructions with data + // following the relocation are not accommodated for (see comment below + // regarding SIGNED{1,2,4}), so it isn't exactly that either. + Value += 1LL << Log2Size; + } + + if (Target.isAbsolute()) { // constant + // SymbolNum of 0 indicates the absolute section. + Type = MachO::X86_64_RELOC_UNSIGNED; + + // FIXME: I believe this is broken, I don't think the linker can understand + // it. I think it would require a local relocation, but I'm not sure if that + // would work either. The official way to get an absolute PCrel relocation + // is to use an absolute symbol (which we don't support yet). + if (IsPCRel) { + IsExtern = 1; + Type = MachO::X86_64_RELOC_BRANCH; + } + } else if (Target.getSymB()) { // A - B + constant + const MCSymbol *A = &Target.getSymA()->getSymbol(); + if (A->isTemporary()) + A = &Writer->findAliasedSymbol(*A); + const MCSymbol *A_Base = Asm.getAtom(*A); + + const MCSymbol *B = &Target.getSymB()->getSymbol(); + if (B->isTemporary()) + B = &Writer->findAliasedSymbol(*B); + const MCSymbol *B_Base = Asm.getAtom(*B); + + // Neither symbol can be modified. + if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } + + // We don't support PCrel relocations of differences. Darwin 'as' doesn't + // implement most of these correctly. + if (IsPCRel) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported pc-relative relocation of difference"); + return; + } + + // The support for the situation where one or both of the symbols would + // require a local relocation is handled just like if the symbols were + // external. 
This is certainly used in the case of debug sections where the + // section has only temporary symbols and thus the symbols don't have base + // symbols. This is encoded using the section ordinal and non-extern + // relocation entries. + + // Darwin 'as' doesn't emit correct relocations for this (it ends up with a + // single SIGNED relocation); reject it for now. Except the case where both + // symbols don't have a base, equal but both NULL. + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } + + // A subtraction expression where either symbol is undefined is a + // non-relocatable expression. + if (A->isUndefined() || B->isUndefined()) { + StringRef Name = A->isUndefined() ? A->getName() : B->getName(); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation with subtraction expression, symbol '" + + Name + "' can not be undefined in a subtraction expression"); + return; + } + + Value += Writer->getSymbolAddress(*A, Layout) - + (!A_Base ? 0 : Writer->getSymbolAddress(*A_Base, Layout)); + Value -= Writer->getSymbolAddress(*B, Layout) - + (!B_Base ? 0 : Writer->getSymbolAddress(*B_Base, Layout)); + + if (!A_Base) + Index = A->getFragment()->getParent()->getOrdinal() + 1; + Type = MachO::X86_64_RELOC_UNSIGNED; + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); + + if (B_Base) + RelSymbol = B_Base; + else + Index = B->getFragment()->getParent()->getOrdinal() + 1; + Type = MachO::X86_64_RELOC_SUBTRACTOR; + } else { + const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + if (Symbol->isTemporary() && Value) { + const MCSection &Sec = Symbol->getSection(); + if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec)) + Symbol->setUsedInReloc(); + } + RelSymbol = Asm.getAtom(*Symbol); + + // Relocations inside debug sections always use local relocations when + // possible. This seems to be done because the debugger doesn't fully + // understand x86_64 relocation entries, and expects to find values that + // have already been fixed up. + if (Symbol->isInSection()) { + const MCSectionMachO &Section = + static_cast<const MCSectionMachO &>(*Fragment->getParent()); + if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) + RelSymbol = nullptr; + } + + // x86_64 almost always uses external relocations, except when there is no + // symbol to use as a base address (a local symbol with no preceding + // non-local symbol). + if (RelSymbol) { + // Add the local offset, if needed. + if (RelSymbol != Symbol) + Value += Layout.getSymbolOffset(*Symbol) - + Layout.getSymbolOffset(*RelSymbol); + } else if (Symbol->isInSection() && !Symbol->isVariable()) { + // The index is the section ordinal (1-based). 
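+      // (Ordinal 0 is reserved: an r_symbolnum of zero denotes the absolute
+      // section, hence the +1 below. For reference, the relocation_info
+      // r_word1 fields packed in this function are laid out as
+      //   bits [23:0]  r_symbolnum  -- symbol index or 1-based section ordinal
+      //   bit  [24]    r_pcrel
+      //   bits [26:25] r_length     -- log2 of the fixup size in bytes
+      //   bit  [27]    r_extern
+      //   bits [31:28] r_type
+      // which matches the shift amounts used for MRE.r_word1 here.)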
+ Index = Symbol->getFragment()->getParent()->getOrdinal() + 1; + Value += Writer->getSymbolAddress(*Symbol, Layout); + + if (IsPCRel) + Value -= FixupAddress + (1 << Log2Size); + } else if (Symbol->isVariable()) { + const MCExpr *Value = Symbol->getVariableValue(); + int64_t Res; + bool isAbs = Value->evaluateAsAbsolute(Res, Layout, + Writer->getSectionAddressMap()); + if (isAbs) { + FixedValue = Res; + return; + } else { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + return; + } + } else { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + return; + } + + MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); + if (IsPCRel) { + if (IsRIPRel) { + if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) { + // x86_64 distinguishes movq foo@GOTPCREL so that the linker can + // rewrite the movq to an leaq at link time if the symbol ends up in + // the same linkage unit. + if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load) + Type = MachO::X86_64_RELOC_GOT_LOAD; + else + Type = MachO::X86_64_RELOC_GOT; + } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { + Type = MachO::X86_64_RELOC_TLV; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } else { + Type = MachO::X86_64_RELOC_SIGNED; + + // The Darwin x86_64 relocation format has a problem where it cannot + // encode an address (L<foo> + <constant>) which is outside the atom + // containing L<foo>. Generally, this shouldn't occur but it does + // happen when we have a RIPrel instruction with data following the + // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel + // adjustment Darwin x86_64 uses, the offset is still negative and the + // linker has no way to recognize this. + // + // To work around this, Darwin uses several special relocation types + // to indicate the offsets. However, the specification or + // implementation of these seems to also be incomplete; they should + // adjust the addend as well based on the actual encoded instruction + // (the additional bias), but instead appear to just look at the final + // offset. + switch (-(Target.getConstant() + (1LL << Log2Size))) { + case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break; + case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break; + case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break; + } + } + } else { + if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported symbol modifier in branch relocation"); + return; + } + + Type = MachO::X86_64_RELOC_BRANCH; + } + } else { + if (Modifier == MCSymbolRefExpr::VK_GOT) { + Type = MachO::X86_64_RELOC_GOT; + } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) { + // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which + // case all we do is set the PCrel bit in the relocation entry; this is + // used with exception handling, for example. The source is required to + // include any necessary offset directly. 
+ Type = MachO::X86_64_RELOC_GOT; + IsPCRel = 1; + } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { + Asm.getContext().reportError( + Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel"); + return; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } else { + Type = MachO::X86_64_RELOC_UNSIGNED; + unsigned Kind = Fixup.getKind(); + if (Kind == X86::reloc_signed_4byte) { + Asm.getContext().reportError( + Fixup.getLoc(), + "32-bit absolute addressing is not supported in 64-bit mode"); + return; + } + } + } + } + + // x86_64 always writes custom values into the fixups. + FixedValue = Value; + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); +} + +bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Log2Size, + uint64_t &FixedValue) { + uint64_t OriginalFixedValue = FixedValue; + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Type = MachO::GENERIC_RELOC_VANILLA; + + // See <reloc.h>. + const MCSymbol *A = &Target.getSymA()->getSymbol(); + + if (!A->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return false; + } + + uint32_t Value = Writer->getSymbolAddress(*A, Layout); + uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); + FixedValue += SecAddr; + uint32_t Value2 = 0; + + if (const MCSymbolRefExpr *B = Target.getSymB()) { + const MCSymbol *SB = &B->getSymbol(); + + if (!SB->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return false; + } + + // Select the appropriate difference relocation type. + // + // Note that there is no longer any semantic difference between these two + // relocation types from the linkers point of view, this is done solely for + // pedantic compatibility with 'as'. + Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF + : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF; + Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); + } + + // Relocations are written out in reverse order, so the PAIR comes first. + if (Type == MachO::GENERIC_RELOC_SECTDIFF || + Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) { + // If the offset is too large to fit in a scattered relocation, + // we're hosed. It's an unfortunate limitation of the MachO format. 
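+    // (A scattered_relocation_info packs r_address into a 24-bit bit-field,
+    // so only offsets up to 0xFFFFFF from the start of the section can be
+    // expressed -- hence the check below and the non-scattered fallback in
+    // the other branch.)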
+ if (FixupOffset > 0xffffff) { + char Buffer[32]; + format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); + Asm.getContext().reportError(Fixup.getLoc(), + Twine("Section too large, can't encode " + "r_address (") + Buffer + + ") into 24 bits of scattered " + "relocation entry."); + return false; + } + + MachO::any_relocation_info MRE; + MRE.r_word0 = ((0 << 0) | // r_address + (MachO::GENERIC_RELOC_PAIR << 24) | // r_type + (Log2Size << 28) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value2; + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); + } else { + // If the offset is more than 24-bits, it won't fit in a scattered + // relocation offset field, so we fall back to using a non-scattered + // relocation. This is a bit risky, as if the offset reaches out of + // the block and the linker is doing scattered loading on this + // symbol, things can go badly. + // + // Required for 'as' compatibility. + if (FixupOffset > 0xffffff) { + FixedValue = OriginalFixedValue; + return false; + } + } + + MachO::any_relocation_info MRE; + MRE.r_word0 = ((FixupOffset << 0) | + (Type << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value; + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); + return true; +} + +void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP && + !is64Bit() && + "Should only be called with a 32-bit TLVP relocation!"); + + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = 0; + + // We're only going to have a second symbol in pic mode and it'll be a + // subtraction from the picbase. For 32-bit pic the addend is the difference + // between the picbase and the next address. For 32-bit static the addend is + // zero. + if (Target.getSymB()) { + // If this is a subtraction then we're pcrel. + uint32_t FixupAddress = + Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); + IsPCRel = 1; + FixedValue = + FixupAddress - + Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) + + Target.getConstant(); + FixedValue += 1ULL << Log2Size; + } else { + FixedValue = 0; + } + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = Value; + MRE.r_word1 = + (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28); + Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(), + MRE); +} + +void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + + // If this is a 32-bit TLVP reloc it's handled a bit differently. + if (Target.getSymA() && + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) { + recordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + return; + } + + // If this is a difference or a defined symbol plus an offset, then we need a + // scattered relocation entry. Differences always require scattered + // relocations. 
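+  // (A scattered entry is flagged by the R_SCATTERED high bit of r_word0 and
+  // carries the target's address in r_word1 instead of a symbol index, which
+  // is what lets the SECTDIFF record plus its companion PAIR record built by
+  // recordScatteredRelocation() describe an "A - B" expression.)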
+ if (Target.getSymB()) { + recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue); + return; + } + + // Get the symbol data, if any. + const MCSymbol *A = nullptr; + if (Target.getSymA()) + A = &Target.getSymA()->getSymbol(); + + // If this is an internal relocation with an offset, it also needs a scattered + // relocation entry. + uint32_t Offset = Target.getConstant(); + if (IsPCRel) + Offset += 1 << Log2Size; + // Try to record the scattered relocation if needed. Fall back to non + // scattered if necessary (see comments in recordScatteredRelocation() + // for details). + if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) && + recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + Log2Size, FixedValue)) + return; + + // See <reloc.h>. + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned Index = 0; + unsigned Type = 0; + const MCSymbol *RelSymbol = nullptr; + + if (Target.isAbsolute()) { // constant + // SymbolNum of 0 indicates the absolute section. + // + // FIXME: Currently, these are never generated (see code below). I cannot + // find a case where they are actually emitted. + Type = MachO::GENERIC_RELOC_VANILLA; + } else { + // Resolve constant variables. + if (A->isVariable()) { + int64_t Res; + if (A->getVariableValue()->evaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + + // Check whether we need an external or internal relocation. + if (Writer->doesSymbolRequireExternRelocation(*A)) { + RelSymbol = A; + // For external relocations, make sure to offset the fixup value to + // compensate for the addend of the symbol address, if it was + // undefined. This occurs with weak definitions, for example. + if (!A->isUndefined()) + FixedValue -= Layout.getSymbolOffset(*A); + } else { + // The index is the section ordinal (1-based). + const MCSection &Sec = A->getSection(); + Index = Sec.getOrdinal() + 1; + FixedValue += Writer->getSectionAddress(&Sec); + } + if (IsPCRel) + FixedValue -= Writer->getSectionAddress(Fragment->getParent()); + + Type = MachO::GENERIC_RELOC_VANILLA; + } + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createX86MachObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter(new X86MachObjectWriter(Is64Bit, + CPUType, + CPUSubtype), + OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp new file mode 100644 index 0000000..bd1bc99 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -0,0 +1,97 @@ +//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace llvm { + class MCObjectWriter; +} + +namespace { + class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { + public: + X86WinCOFFObjectWriter(bool Is64Bit); + ~X86WinCOFFObjectWriter() override; + + unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, + const MCAsmBackend &MAB) const override; + }; +} + +X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) + : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 + : COFF::IMAGE_FILE_MACHINE_I386) {} + +X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} + +unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection, + const MCAsmBackend &MAB) const { + unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind(); + + MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? + MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + + if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) { + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return COFF::IMAGE_REL_AMD64_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return COFF::IMAGE_REL_AMD64_ADDR32NB; + return COFF::IMAGE_REL_AMD64_ADDR32; + case FK_Data_8: + return COFF::IMAGE_REL_AMD64_ADDR64; + case FK_SecRel_2: + return COFF::IMAGE_REL_AMD64_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_AMD64_SECREL; + default: + llvm_unreachable("unsupported relocation type"); + } + } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) { + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return COFF::IMAGE_REL_I386_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return COFF::IMAGE_REL_I386_DIR32NB; + return COFF::IMAGE_REL_I386_DIR32; + case FK_SecRel_2: + return COFF::IMAGE_REL_I386_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_I386_SECREL; + default: + llvm_unreachable("unsupported relocation type"); + } + } else + llvm_unreachable("Unsupported COFF machine type."); +} + +MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit) { + MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit); + return createWinCOFFObjectWriter(MOTW, OS); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp new file mode 100644 index 0000000..d045118 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -0,0 +1,60 @@ +//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "X86MCTargetDesc.h" +#include "llvm/MC/MCWin64EH.h" +#include "llvm/MC/MCWinCOFFStreamer.h" + +using namespace llvm; + +namespace { +class X86WinCOFFStreamer : public MCWinCOFFStreamer { + Win64EH::UnwindEmitter EHStreamer; +public: + X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE, + raw_pwrite_stream &OS) + : MCWinCOFFStreamer(C, AB, *CE, OS) {} + + void EmitWinEHHandlerData() override; + void EmitWindowsUnwindTables() override; + void FinishImpl() override; +}; + +void X86WinCOFFStreamer::EmitWinEHHandlerData() { + MCStreamer::EmitWinEHHandlerData(); + + // We have to emit the unwind info now, because this directive + // actually switches to the .xdata section! + EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo()); +} + +void X86WinCOFFStreamer::EmitWindowsUnwindTables() { + if (!getNumWinFrameInfos()) + return; + EHStreamer.Emit(*this); +} + +void X86WinCOFFStreamer::FinishImpl() { + EmitFrames(nullptr); + EmitWindowsUnwindTables(); + + MCWinCOFFStreamer::FinishImpl(); +} +} + +MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + raw_pwrite_stream &OS, + MCCodeEmitter *CE, bool RelaxAll, + bool IncrementalLinkerCompatible) { + X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); + S->getAssembler().setRelaxAll(RelaxAll); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); + return S; +} + diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp new file mode 100644 index 0000000..fceb083 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -0,0 +1,22 @@ +//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheX86_32Target, llvm::TheX86_64Target; + +extern "C" void LLVMInitializeX86TargetInfo() { + RegisterTarget<Triple::x86, /*HasJIT=*/true> + X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above"); + + RegisterTarget<Triple::x86_64, /*HasJIT=*/true> + Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64"); +} diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp new file mode 100644 index 0000000..619f7c8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -0,0 +1,464 @@ +//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. 
+// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecode.h" +#include "llvm/CodeGen/MachineValueType.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + // Defaults the copying the dest value. + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + + // Decode the immediate. + unsigned ZMask = Imm & 15; + unsigned CountD = (Imm >> 4) & 3; + unsigned CountS = (Imm >> 6) & 3; + + // CountS selects which input element to use. + unsigned InVal = 4 + CountS; + // CountD specifies which element of destination to update. + ShuffleMask[CountD] = InVal; + // ZMask zaps values, potentially overriding the CountD elt. + if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; + if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; + if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; + if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; +} + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = NElts / 2; i != NElts; ++i) + ShuffleMask.push_back(NElts + i); + + for (unsigned i = NElts / 2; i != NElts; ++i) + ShuffleMask.push_back(i); +} + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i != NElts / 2; ++i) + ShuffleMask.push_back(i); + + for (unsigned i = 0; i != NElts / 2; ++i) + ShuffleMask.push_back(NElts + i); +} + +void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i); + ShuffleMask.push_back(2 * i); + } +} + +void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i + 1); + ShuffleMask.push_back(2 * i + 1); + } +} + +void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned ScalarSizeInBits = VT.getScalarSizeInBits(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + unsigned NumLaneSubElts = 64 / ScalarSizeInBits; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts) + for (unsigned s = 0; s != NumLaneSubElts; s++) + ShuffleMask.push_back(l + s); +} + +void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + int M = SM_SentinelZero; + if (i >= Imm) M = i - Imm + l; + ShuffleMask.push_back(M); + } +} + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + 
for (unsigned i = 0; i < NumLaneElts; ++i) { + unsigned Base = i + Imm; + int M = Base + l; + if (Base >= NumLaneElts) M = SM_SentinelZero; + ShuffleMask.push_back(M); + } +} + +void DecodePALIGNRMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Base = i + Offset; + // if i+offset is out of this lane then we actually need the other source + if (Base >= NumLaneElts) Base += NumElts - NumLaneElts; + ShuffleMask.push_back(Base + l); + } + } +} + +/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*. +/// VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + unsigned NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + l); + NewImm /= NumLaneElts; + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm + } +} + +void DecodePSHUFHWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + 4 + (NewImm & 3)); + NewImm >>= 2; + } + } +} + +void DecodePSHUFLWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + (NewImm & 3)); + NewImm >>= 2; + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + } +} + +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumHalfElts = NumElts / 2; + + for (unsigned l = 0; l != NumHalfElts; ++l) + ShuffleMask.push_back(l + NumHalfElts); + for (unsigned h = 0; h != NumHalfElts; ++h) + ShuffleMask.push_back(h); +} + +/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates +/// the type of the vector allowing it to handle different datatypes and vector +/// widths. 
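+/// As an illustrative example, for MVT::v4f32 and Imm = 0x1B (0b00011011)
+/// the decoded mask is <3, 2, 5, 4>: the low two elements come from the
+/// first source (indices Imm[1:0] and Imm[3:2]) and the high two from the
+/// second source (indices 4 + Imm[5:4] and 4 + Imm[7:6]).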
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + unsigned NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + // each half of a lane comes from different source + for (unsigned s = 0; s != NumElts * 2; s += NumElts) { + for (unsigned i = 0; i != NumLaneElts / 2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + s + l); + NewImm /= NumLaneElts; + } + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm + } +} + +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// and punpckh*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l + NumLaneElts / 2, e = l + NumLaneElts; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i + NumElts); // Reads from src/src2 + } + } +} + +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// and punpckl*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l, e = l + NumLaneElts / 2; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i + NumElts); // Reads from src/src2 + } + } +} + +/// \brief Decode a shuffle packed values at 128-bit granularity +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits(); + unsigned ControlBitsMask = NumLanes - 1; + unsigned NumControlBits = NumLanes / 2; + + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask; + // We actually need the other source. + if (l >= NumLanes / 2) + LaneMask += NumLanes; + for (unsigned i = 0; i != NumElementsInLane; ++i) + ShuffleMask.push_back(LaneMask * NumElementsInLane + i); + } +} + +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned HalfSize = VT.getVectorNumElements() / 2; + + for (unsigned l = 0; l != 2; ++l) { + unsigned HalfMask = Imm >> (l * 4); + unsigned HalfBegin = (HalfMask & 0x3) * HalfSize; + for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i) + ShuffleMask.push_back(HalfMask & 8 ? 
SM_SentinelZero : (int)i); + } +} + +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + if (M == (uint64_t)SM_SentinelUndef) { + ShuffleMask.push_back(M); + continue; + } + // For AVX vectors with 32 bytes the base of the shuffle is the half of + // the vector we're inside. + int Base = i < 16 ? 0 : 16; + // If the high bit (7) of the byte is set, the element is zeroed. + if (M & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (M & 0xf); + ShuffleMask.push_back(Index); + } + } +} + +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + int ElementBits = VT.getScalarSizeInBits(); + int NumElements = VT.getVectorNumElements(); + for (int i = 0; i < NumElements; ++i) { + // If there are more than 8 elements in the vector, then any immediate blend + // mask applies to each 128-bit lane. There can never be more than + // 8 elements in a 128-bit lane with an immediate blend. + int Bit = NumElements > 8 ? i % (128 / ElementBits) : i; + assert(Bit < 8 && + "Immediate blends only operate over 8 elements at a time!"); + ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i); + } +} + +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back((Imm >> (2 * i)) & 3); + } +} + +void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) { + unsigned NumDstElts = DstVT.getVectorNumElements(); + unsigned SrcScalarBits = SrcVT.getScalarSizeInBits(); + unsigned DstScalarBits = DstVT.getScalarSizeInBits(); + unsigned Scale = DstScalarBits / SrcScalarBits; + assert(SrcScalarBits < DstScalarBits && + "Expected zero extension mask to increase scalar size"); + assert(SrcVT.getVectorNumElements() >= NumDstElts && + "Too many zero extension lanes"); + + for (unsigned i = 0; i != NumDstElts; i++) { + Mask.push_back(i); + for (unsigned j = 1; j != Scale; j++) + Mask.push_back(SM_SentinelZero); + } +} + +void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + ShuffleMask.push_back(0); + for (unsigned i = 1; i < NumElts; i++) + ShuffleMask.push_back(SM_SentinelZero); +} + +void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) { + // First element comes from the first element of second source. + // Remaining elements: Load zero extends / Move copies from first source. + unsigned NumElts = VT.getVectorNumElements(); + Mask.push_back(NumElts); + for (unsigned i = 1; i < NumElts; i++) + Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); +} + +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit extraction instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. 
+ if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert index and index to work with bytes. + Len /= 8; + Idx /= 8; + + // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes + // of the lower 64-bits. The upper 64-bits are undefined. + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + Idx); + for (int i = Len; i != 8; ++i) + ShuffleMask.push_back(SM_SentinelZero); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit insertion instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert index and index to work with bytes. + Len /= 8; + Idx /= 8; + + // INSERTQ: Extract lowest Len bytes from lower half of second source and + // insert over first source starting at Idx byte. The upper 64-bits are + // undefined. + for (int i = 0; i != Idx; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Idx + Len; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +} // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h new file mode 100644 index 0000000..72db6a8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -0,0 +1,122 @@ +//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. 
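+//
+// A minimal usage sketch (illustrative):
+//   SmallVector<int, 8> Mask;
+//   DecodeUNPCKHMask(MVT::v4i32, Mask);   // Mask is now <2, 6, 3, 7>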
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H +#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/ArrayRef.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +class MVT; + +enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decodes a PSWAPD 3DNow! instruction. +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates +/// the type of the vector allowing it to handle different datatypes and vector +/// widths. +void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// and punpckh*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// and punpckl*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a PSHUFB mask from a raw array of constants such as from +/// BUILD_VECTOR. +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a BLEND immediate mask into a shuffle mask. +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a shuffle packed values at 128-bit granularity +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a zero extension instruction as a shuffle mask. +void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a move lower and zero upper instruction as a shuffle mask. 
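+/// For MVT::v4i32, for example, this produces <0, Z, Z, Z>, where Z is
+/// SM_SentinelZero.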
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a scalar float move instruction as a shuffle mask. +void DecodeScalarMoveMask(MVT VT, bool IsLoad, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask. +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask. +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); +} // llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h new file mode 100644 index 0000000..fbec662 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -0,0 +1,76 @@ +//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the x86 +// target library, as used by the LLVM JIT. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86_H +#define LLVM_LIB_TARGET_X86_X86_H + +#include "llvm/Support/CodeGen.h" + +namespace llvm { + +class FunctionPass; +class ImmutablePass; +class X86TargetMachine; + +/// This pass converts a legalized DAG into a X86-specific DAG, ready for +/// instruction scheduling. +FunctionPass *createX86ISelDag(X86TargetMachine &TM, + CodeGenOpt::Level OptLevel); + +/// This pass initializes a global base register for PIC on x86-32. +FunctionPass* createX86GlobalBaseRegPass(); + +/// This pass combines multiple accesses to local-dynamic TLS variables so that +/// the TLS base address for the module is only fetched once per execution path +/// through the function. +FunctionPass *createCleanupLocalDynamicTLSPass(); + +/// This function returns a pass which converts floating-point register +/// references and pseudo instructions into floating-point stack references and +/// physical instructions. +FunctionPass *createX86FloatingPointStackifierPass(); + +/// This pass inserts AVX vzeroupper instructions before each call to avoid +/// transition penalty between functions encoded with AVX and SSE. +FunctionPass *createX86IssueVZeroUpperPass(); + +/// Return a pass that pads short functions with NOOPs. +/// This will prevent a stall when returning on the Atom. +FunctionPass *createX86PadShortFunctions(); + +/// Return a a pass that selectively replaces certain instructions (like add, +/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA +/// instructions, in order to eliminate execution delays in some processors. +FunctionPass *createX86FixupLEAs(); + +/// Return a pass that removes redundant address recalculations. +FunctionPass *createX86OptimizeLEAs(); + +/// Return a pass that optimizes the code-size of x86 call sequences. This is +/// done by replacing esp-relative movs with pushes. 
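+/// For example (illustrative), argument stores such as
+///   movl $42, (%esp)
+///   movl $1, 4(%esp)
+///   calll foo
+/// can instead be emitted as
+///   pushl $1
+///   pushl $42
+///   calll foo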
+FunctionPass *createX86CallFrameOptimization(); + +/// Return an IR pass that inserts EH registration stack objects and explicit +/// EH state updates. This pass must run after EH preparation, which does +/// Windows-specific but architecture-neutral preparation. +FunctionPass *createX86WinEHStatePass(); + +/// Return a Machine IR pass that expands X86-specific pseudo +/// instructions into a sequence of actual instructions. This pass +/// must run after prologue/epilogue insertion and before lowering +/// the MachineInstr to MC. +FunctionPass *createX86ExpandPseudoPass(); +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td new file mode 100644 index 0000000..8902a85 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -0,0 +1,787 @@ +//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a target description file for the Intel i386 architecture, referred +// to here as the "X86" architecture. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... +// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// X86 Subtarget state +// + +def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true", + "64-bit mode (x86_64)">; +def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true", + "32-bit mode (80386)">; +def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true", + "16-bit mode (i8086)">; + +//===----------------------------------------------------------------------===// +// X86 Subtarget features +//===----------------------------------------------------------------------===// + +def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", + "Enable conditional move instructions">; + +def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", + "Support POPCNT instruction">; + +def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true", + "Support fxsave/fxrestore instructions">; + +def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", + "Support xsave instructions">; + +def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", + "Support xsaveopt instructions">; + +def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", + "Support xsavec instructions">; + +def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", + "Support xsaves instructions">; + +def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", + "Enable SSE instructions", + // SSE codegen depends on cmovs, and all + // SSE1+ processors support them. 
+ [FeatureCMOV]>; +def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", + "Enable SSE2 instructions", + [FeatureSSE1]>; +def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3", + "Enable SSE3 instructions", + [FeatureSSE2]>; +def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3", + "Enable SSSE3 instructions", + [FeatureSSE3]>; +def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41", + "Enable SSE 4.1 instructions", + [FeatureSSSE3]>; +def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", + "Enable SSE 4.2 instructions", + [FeatureSSE41]>; +// The MMX subtarget feature is separate from the rest of the SSE features +// because it's important (for odd compatibility reasons) to be able to +// turn it off explicitly while allowing SSE+ to be on. +def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX", + "Enable MMX instructions">; +def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", + "Enable 3DNow! instructions", + [FeatureMMX]>; +def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", + "Enable 3DNow! Athlon instructions", + [Feature3DNow]>; +// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied +// feature, because SSE2 can be disabled (e.g. for compiling OS kernels) +// without disabling 64-bit mode. +def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", + "Support 64-bit instructions", + [FeatureCMOV]>; +def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", + "64-bit with cmpxchg16b", + [Feature64Bit]>; +def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", + "Bit testing of memory is slow">; +def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", + "SHLD instruction is slow">; +// FIXME: This should not apply to CPUs that do not have SSE. 
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", + "IsUAMem16Slow", "true", + "Slow unaligned 16-byte memory access">; +def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; +def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", + "Support SSE 4a instructions", + [FeatureSSE3]>; + +def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", + "Enable AVX instructions", + [FeatureSSE42]>; +def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", + "Enable AVX2 instructions", + [FeatureAVX]>; +def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F", + "Enable AVX-512 instructions", + [FeatureAVX2]>; +def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", + "Enable AVX-512 Exponential and Reciprocal Instructions", + [FeatureAVX512]>; +def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", + "Enable AVX-512 Conflict Detection Instructions", + [FeatureAVX512]>; +def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", + "Enable AVX-512 PreFetch Instructions", + [FeatureAVX512]>; +def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true", + "Enable AVX-512 Doubleword and Quadword Instructions", + [FeatureAVX512]>; +def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", + "Enable AVX-512 Byte and Word Instructions", + [FeatureAVX512]>; +def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", + "Enable AVX-512 Vector Length eXtensions", + [FeatureAVX512]>; +def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", + "Enable protection keys">; +def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", + "Enable packed carry-less multiplication instructions", + [FeatureSSE2]>; +def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", + "Enable three-operand fused multiple-add", + [FeatureAVX]>; +def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", + "Enable four-operand fused multiple-add", + [FeatureAVX, FeatureSSE4A]>; +def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", + "Enable XOP instructions", + [FeatureFMA4]>; +def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem", + "HasSSEUnalignedMem", "true", + "Allow unaligned memory operands with SSE instructions">; +def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", + "Enable AES instructions", + [FeatureSSE2]>; +def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true", + "Enable TBM instructions">; +def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true", + "Support MOVBE instruction">; +def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true", + "Support RDRAND instruction">; +def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", + "Support 16-bit floating point conversion instructions", + [FeatureAVX]>; +def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true", + "Support FS/GS Base instructions">; +def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true", + "Support LZCNT instruction">; +def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true", + "Support BMI instructions">; +def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", + "Support BMI2 instructions">; +def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true", + "Support RTM instructions">; +def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true", + "Support HLE">; +def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", + "Support ADX instructions">; +def FeatureSHA : SubtargetFeature<"sha", 
"HasSHA", "true", + "Enable SHA instructions", + [FeatureSSE2]>; +def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", + "Support PRFCHW instructions">; +def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", + "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; +def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", + "Support MPX instructions">; +def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", + "Use LEA for adjusting the stack pointer">; +def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", + "HasSlowDivide32", "true", + "Use 8-bit divide for positive values less than 256">; +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", + "HasSlowDivide64", "true", + "Use 16-bit divide for positive values less than 65536">; +def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", + "PadShortFunctions", "true", + "Pad short functions">; +// TODO: This feature ought to be renamed. +// What it really refers to are CPUs for which certain instructions +// (which ones besides the example below?) are microcoded. +// The best examples of this are the memory forms of CALL and PUSH +// instructions, which should be avoided in favor of a MOV + register CALL/PUSH. +def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", + "CallRegIndirect", "true", + "Call register indirect">; +def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", + "LEA instruction needs inputs at AG stage">; +def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", + "LEA instruction with certain arguments is slow">; +def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", + "INC and DEC instructions are slower than ADD and SUB">; +def FeatureSoftFloat + : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software floating point features.">; + +//===----------------------------------------------------------------------===// +// X86 processors supported. +//===----------------------------------------------------------------------===// + +include "X86Schedule.td" + +def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", + "Intel Atom processors">; +def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", + "Intel Silvermont processors">; + +class Proc<string Name, list<SubtargetFeature> Features> + : ProcessorModel<Name, GenericModel, Features>; + +def : Proc<"generic", [FeatureSlowUAMem16]>; +def : Proc<"i386", [FeatureSlowUAMem16]>; +def : Proc<"i486", [FeatureSlowUAMem16]>; +def : Proc<"i586", [FeatureSlowUAMem16]>; +def : Proc<"pentium", [FeatureSlowUAMem16]>; +def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"i686", [FeatureSlowUAMem16]>; +def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>; +def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV, + FeatureFXSR]>; +def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR]>; +def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR]>; +def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; + +// Intel Core Duo. 
+def : ProcessorModel<"yonah", SandyBridgeModel, + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; + +// NetBurst. +def : Proc<"prescott", + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; +def : Proc<"nocona", [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem +]>; + +// Intel Core 2 Solo/Duo. +def : ProcessorModel<"core2", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; +def : ProcessorModel<"penryn", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE41, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; + +// Atom CPUs. +class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ + ProcIntelAtom, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowBTMem, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions, + FeatureLAHFSAHF +]>; +def : BonnellProc<"bonnell">; +def : BonnellProc<"atom">; // Pin the generic name to the baseline. + +class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ + ProcIntelSLM, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; +def : SilvermontProc<"silvermont">; +def : SilvermontProc<"slm">; // Legacy alias. + +// "Arrandale" along with corei3 and corei5 +class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureLAHFSAHF +]>; +def : NehalemProc<"nehalem">; +def : NehalemProc<"corei7">; + +// Westmere is a similar machine to nehalem with some additional features. +// Westmere is the corei3/i5/i7 path from nehalem to sandybridge +class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureLAHFSAHF +]>; +def : WestmereProc<"westmere">; + +// SSE is not listed here since llvm treats AVX as a reimplementation of SSE, +// rather than a superset. +class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureLAHFSAHF +]>; +def : SandyBridgeProc<"sandybridge">; +def : SandyBridgeProc<"corei7-avx">; // Legacy alias. + +class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; +def : IvyBridgeProc<"ivybridge">; +def : IvyBridgeProc<"core-avx-i">; // Legacy alias. 
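+
+// A minimal sketch (hypothetical CPU name, not a real definition) of how a
+// new processor is typically added: pick a scheduling model and list its
+// feature set, or wrap it in a class when several -mcpu aliases share it.
+//   def : ProcessorModel<"my-cpu", SandyBridgeModel,
+//                        [FeatureMMX, FeatureAVX, FeatureFXSR,
+//                         FeatureCMPXCHG16B, FeaturePOPCNT, FeatureLAHFSAHF]>;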
+ +class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; +def : HaswellProc<"haswell">; +def : HaswellProc<"core-avx2">; // Legacy alias. + +class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; +def : BroadwellProc<"broadwell">; + +// FIXME: define KNL model +class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureERI, + FeatureCDI, + FeaturePFI, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureMPX, + FeatureLAHFSAHF +]>; +def : KnightsLandingProc<"knl">; + +// FIXME: define SKX model +class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureMPX, + FeatureXSAVEC, + FeatureXSAVES, + FeatureLAHFSAHF +]>; +def : SkylakeProc<"skylake">; +def : SkylakeProc<"skx">; // Legacy alias. + + +// AMD CPUs. 
+ +def : Proc<"k6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; +def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; + +// Bobcat +def : Proc<"btver1", [ + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; + +// Jaguar +def : ProcessorModel<"btver2", BtVer2Model, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureAES, + FeaturePCLMUL, + FeatureBMI, + FeatureF16C, + FeatureMOVBE, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; + +// Bulldozer +def : Proc<"bdver1", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; +// Piledriver +def : Proc<"bdver2", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; + +// Steamroller +def : Proc<"bdver3", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + 
FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; + +// Excavator +def : Proc<"bdver4", [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureBMI2, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; + +def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; + +def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>; + +// We also provide a generic 64-bit specific x86 processor model which tries to +// be good for modern chips without enabling instruction set encodings past the +// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and +// modern 64-bit x86 chip, and enables features that are generally beneficial. +// +// We currently use the Sandy Bridge model as the default scheduling model as +// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which +// covers a huge swath of x86 processors. If there are specific scheduling +// knobs which need to be tuned differently for AMD chips, we might consider +// forming a common base for them. +def : ProcessorModel<"x86-64", SandyBridgeModel, + [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem ]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "X86RegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "X86InstrInfo.td" + +def X86InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "X86CallingConv.td" + + +//===----------------------------------------------------------------------===// +// Assembly Parser +//===----------------------------------------------------------------------===// + +def ATTAsmParserVariant : AsmParserVariant { + int Variant = 0; + + // Variant name. + string Name = "att"; + + // Discard comments in assembly strings. + string CommentDelimiter = "#"; + + // Recognize hard coded registers. + string RegisterPrefix = "%"; +} + +def IntelAsmParserVariant : AsmParserVariant { + int Variant = 1; + + // Variant name. + string Name = "intel"; + + // Discard comments in assembly strings. + string CommentDelimiter = ";"; + + // Recognize hard coded registers. + string RegisterPrefix = ""; +} + +//===----------------------------------------------------------------------===// +// Assembly Printers +//===----------------------------------------------------------------------===// + +// The X86 target supports two different syntaxes for emitting machine code. 
+// This is controlled by the -x86-asm-syntax={att|intel} +def ATTAsmWriter : AsmWriter { + string AsmWriterClassName = "ATTInstPrinter"; + int Variant = 0; +} +def IntelAsmWriter : AsmWriter { + string AsmWriterClassName = "IntelInstPrinter"; + int Variant = 1; +} + +def X86 : Target { + // Information about the instructions... + let InstructionSet = X86InstrInfo; + let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; + let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; +} diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp new file mode 100644 index 0000000..2170e62 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -0,0 +1,706 @@ +//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to X86 machine code. +// +//===----------------------------------------------------------------------===// + +#include "X86AsmPrinter.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Primitive Helper Functions. +//===----------------------------------------------------------------------===// + +/// runOnMachineFunction - Emit the function body. +/// +bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &MF.getSubtarget<X86Subtarget>(); + + SMShadowTracker.startFunction(MF); + + SetupMachineFunction(MF); + + if (Subtarget->isTargetCOFF()) { + bool Intrn = MF.getFunction()->hasInternalLinkage(); + OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer->EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC + : COFF::IMAGE_SYM_CLASS_EXTERNAL); + OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + OutStreamer->EndCOFFSymbolDef(); + } + + // Emit the rest of the function body. + EmitFunctionBody(); + + // We didn't modify anything. + return false; +} + +/// printSymbolOperand - Print a raw symbol reference operand. This handles +/// jump tables, constant pools, global address and external symbols, all of +/// which print to a label with various suffixes for relocation types etc. 
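+/// For example, a GOT-relative reference prints with an "@GOTPCREL" suffix,
+/// and a Darwin non-lazy pointer reference prints as "<sym>$non_lazy_ptr"
+/// minus the PIC base label.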
+static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, + raw_ostream &O) { + switch (MO.getType()) { + default: llvm_unreachable("unknown symbol type!"); + case MachineOperand::MO_ConstantPoolIndex: + P.GetCPISymbol(MO.getIndex())->print(O, P.MAI); + P.printOffset(MO.getOffset(), O); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + + MCSymbol *GVSym; + if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) + GVSym = P.getSymbolWithGlobalValueBase(GV, "$stub"); + else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE || + MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE) + GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + else + GVSym = P.getSymbol(GV); + + // Handle dllimport linkage. + if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) + GVSym = + P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName()); + + if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) { + MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); + } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){ + MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry( + Sym); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); + } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) { + MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub"); + MachineModuleInfoImpl::StubValueTy &StubSym = + P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); + } + + // If the name begins with a dollar-sign, enclose it in parens. We do this + // to avoid having it look like an integer immediate to the assembler. + if (GVSym->getName()[0] != '$') + GVSym->print(O, P.MAI); + else { + O << '('; + GVSym->print(O, P.MAI); + O << ')'; + } + P.printOffset(MO.getOffset(), O); + break; + } + } + + switch (MO.getTargetFlags()) { + default: + llvm_unreachable("Unknown target flag on GV operand"); + case X86II::MO_NO_FLAG: // No flag. + break; + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DLLIMPORT: + case X86II::MO_DARWIN_STUB: + // These affect the name of the symbol, not any suffix. 
+ break; + case X86II::MO_GOT_ABSOLUTE_ADDRESS: + O << " + [.-"; + P.MF->getPICBaseSymbol()->print(O, P.MAI); + O << ']'; + break; + case X86II::MO_PIC_BASE_OFFSET: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: + O << '-'; + P.MF->getPICBaseSymbol()->print(O, P.MAI); + break; + case X86II::MO_TLSGD: O << "@TLSGD"; break; + case X86II::MO_TLSLD: O << "@TLSLD"; break; + case X86II::MO_TLSLDM: O << "@TLSLDM"; break; + case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break; + case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break; + case X86II::MO_TPOFF: O << "@TPOFF"; break; + case X86II::MO_DTPOFF: O << "@DTPOFF"; break; + case X86II::MO_NTPOFF: O << "@NTPOFF"; break; + case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break; + case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break; + case X86II::MO_GOT: O << "@GOT"; break; + case X86II::MO_GOTOFF: O << "@GOTOFF"; break; + case X86II::MO_PLT: O << "@PLT"; break; + case X86II::MO_TLVP: O << "@TLVP"; break; + case X86II::MO_TLVP_PIC_BASE: + O << "@TLVP" << '-'; + P.MF->getPICBaseSymbol()->print(O, P.MAI); + break; + case X86II::MO_SECREL: O << "@SECREL32"; break; + } +} + +static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, + unsigned OpNo, raw_ostream &O, + const char *Modifier = nullptr, unsigned AsmVariant = 0); + +/// printPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value. These print slightly differently, for +/// example, a $ is not emitted. +static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI, + unsigned OpNo, raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + default: llvm_unreachable("Unknown pcrel immediate operand"); + case MachineOperand::MO_Register: + // pc-relativeness was handled when computing the value in the reg. + printOperand(P, MI, OpNo, O); + return; + case MachineOperand::MO_Immediate: + O << MO.getImm(); + return; + case MachineOperand::MO_GlobalAddress: + printSymbolOperand(P, MO, O); + return; + } +} + +static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, + unsigned OpNo, raw_ostream &O, const char *Modifier, + unsigned AsmVariant) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + default: llvm_unreachable("unknown operand type!"); + case MachineOperand::MO_Register: { + // FIXME: Enumerating AsmVariant, so we can remove magic number. + if (AsmVariant == 0) O << '%'; + unsigned Reg = MO.getReg(); + if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { + unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : + (strcmp(Modifier+6,"32") == 0) ? 32 : + (strcmp(Modifier+6,"16") == 0) ? 16 : 8; + Reg = getX86SubSuperRegister(Reg, Size); + } + O << X86ATTInstPrinter::getRegisterName(Reg); + return; + } + + case MachineOperand::MO_Immediate: + if (AsmVariant == 0) O << '$'; + O << MO.getImm(); + return; + + case MachineOperand::MO_GlobalAddress: { + if (AsmVariant == 0) O << '$'; + printSymbolOperand(P, MO, O); + break; + } + } +} + +static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, + unsigned Op, raw_ostream &O, + const char *Modifier = nullptr) { + const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); + const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); + + // If we really don't want to print out (rip), don't. 
+ bool HasBaseReg = BaseReg.getReg() != 0; + if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") && + BaseReg.getReg() == X86::RIP) + HasBaseReg = false; + + // HasParenPart - True if we will print out the () part of the mem ref. + bool HasParenPart = IndexReg.getReg() || HasBaseReg; + + switch (DispSpec.getType()) { + default: + llvm_unreachable("unknown operand type!"); + case MachineOperand::MO_Immediate: { + int DispVal = DispSpec.getImm(); + if (DispVal || !HasParenPart) + O << DispVal; + break; + } + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ConstantPoolIndex: + printSymbolOperand(P, DispSpec, O); + } + + if (Modifier && strcmp(Modifier, "H") == 0) + O << "+8"; + + if (HasParenPart) { + assert(IndexReg.getReg() != X86::ESP && + "X86 doesn't allow scaling by ESP"); + + O << '('; + if (HasBaseReg) + printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier); + + if (IndexReg.getReg()) { + O << ','; + printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier); + unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + if (ScaleVal != 1) + O << ',' << ScaleVal; + } + O << ')'; + } +} + +static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI, + unsigned Op, raw_ostream &O, + const char *Modifier = nullptr) { + assert(isMem(MI, Op) && "Invalid memory reference!"); + const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg); + if (Segment.getReg()) { + printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier); + O << ':'; + } + printLeaMemReference(P, MI, Op, O, Modifier); +} + +static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, + unsigned Op, raw_ostream &O, + const char *Modifier = nullptr, + unsigned AsmVariant = 1) { + const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); + unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); + const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); + const MachineOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant); + O << ':'; + } + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant); + NeedPlus = true; + } + + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << DispVal; + } + } + O << ']'; +} + +static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, + char Mode, raw_ostream &O) { + unsigned Reg = MO.getReg(); + switch (Mode) { + default: return true; // Unknown mode. 
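// For example, if the operand was allocated to RAX, mode 'b' prints %al,
// 'h' prints %ah, 'w' prints %ax, 'k' prints %eax, and 'q' prints %rax on
// a 64-bit subtarget (%eax otherwise), per the cases below.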
+ case 'b': // Print QImode register + Reg = getX86SubSuperRegister(Reg, 8); + break; + case 'h': // Print QImode high register + Reg = getX86SubSuperRegister(Reg, 8, true); + break; + case 'w': // Print HImode register + Reg = getX86SubSuperRegister(Reg, 16); + break; + case 'k': // Print SImode register + Reg = getX86SubSuperRegister(Reg, 32); + break; + case 'q': + // Print 64-bit register names if 64-bit integer registers are available. + // Otherwise, print 32-bit register names. + Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32); + break; + } + + O << '%' << X86ATTInstPrinter::getRegisterName(Reg); + return false; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + const MachineOperand &MO = MI->getOperand(OpNo); + + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'a': // This is an address. Currently only 'i' and 'r' are expected. + switch (MO.getType()) { + default: + return true; + case MachineOperand::MO_Immediate: + O << MO.getImm(); + return false; + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ExternalSymbol: + llvm_unreachable("unexpected operand type!"); + case MachineOperand::MO_GlobalAddress: + printSymbolOperand(*this, MO, O); + if (Subtarget->isPICStyleRIPRel()) + O << "(%rip)"; + return false; + case MachineOperand::MO_Register: + O << '('; + printOperand(*this, MI, OpNo, O); + O << ')'; + return false; + } + + case 'c': // Don't print "$" before a global var name or constant. + switch (MO.getType()) { + default: + printOperand(*this, MI, OpNo, O); + break; + case MachineOperand::MO_Immediate: + O << MO.getImm(); + break; + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ExternalSymbol: + llvm_unreachable("unexpected operand type!"); + case MachineOperand::MO_GlobalAddress: + printSymbolOperand(*this, MO, O); + break; + } + return false; + + case 'A': // Print '*' before a register (it must be a register) + if (MO.isReg()) { + O << '*'; + printOperand(*this, MI, OpNo, O); + return false; + } + return true; + + case 'b': // Print QImode register + case 'h': // Print QImode high register + case 'w': // Print HImode register + case 'k': // Print SImode register + case 'q': // Print DImode register + if (MO.isReg()) + return printAsmMRegister(*this, MO, ExtraCode[0], O); + printOperand(*this, MI, OpNo, O); + return false; + + case 'P': // This is the operand of a call, treat specially. + printPCRelImm(*this, MI, OpNo, O); + return false; + + case 'n': // Negate the immediate or print a '-' before the operand. + // Note: this is a temporary solution. It should be handled target + // independently as part of the 'MC' work. 
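  // For example, with modifier 'n' an immediate operand of 4 is printed
  // as -4; for any other operand kind a '-' is emitted and the operand is
  // then printed normally, as the code below implements.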
+ if (MO.isImm()) { + O << -MO.getImm(); + return false; + } + O << '-'; + } + } + + printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant); + return false; +} + +bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (AsmVariant) { + printIntelMemReference(*this, MI, OpNo, O); + return false; + } + + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'b': // Print QImode register + case 'h': // Print QImode high register + case 'w': // Print HImode register + case 'k': // Print SImode register + case 'q': // Print SImode register + // These only apply to registers, ignore on mem. + break; + case 'H': + printMemReference(*this, MI, OpNo, O, "H"); + return false; + case 'P': // Don't print @PLT, but do print as memory. + printMemReference(*this, MI, OpNo, O, "no-rip"); + return false; + } + } + printMemReference(*this, MI, OpNo, O); + return false; +} + +void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { + const Triple &TT = TM.getTargetTriple(); + + if (TT.isOSBinFormatMachO()) + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + + if (TT.isOSBinFormatCOFF()) { + // Emit an absolute @feat.00 symbol. This appears to be some kind of + // compiler features bitfield read by link.exe. + if (TT.getArch() == Triple::x86) { + MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00")); + OutStreamer->BeginCOFFSymbolDef(S); + OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); + OutStreamer->EndCOFFSymbolDef(); + // According to the PE-COFF spec, the LSB of this value marks the object + // for "registered SEH". This means that all SEH handler entry points + // must be registered in .sxdata. Use of any unregistered handlers will + // cause the process to terminate immediately. LLVM does not know how to + // register any SEH handlers, so its object files should be safe. + OutStreamer->EmitSymbolAttribute(S, MCSA_Global); + OutStreamer->EmitAssignment( + S, MCConstantExpr::create(int64_t(1), MMI->getContext())); + } + } + OutStreamer->EmitSyntaxDirective(); +} + +static void +emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, + MachineModuleInfoImpl::StubValueTy &MCSym) { + // L_foo$stub: + OutStreamer.EmitLabel(StubLabel); + // .indirect_symbol _foo + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, 4/*size*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs; however, sometimes the types are local to the file. + // We need to fill in the value for the NLP in those cases. 
+ OutStreamer.EmitValue( + MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()), + 4 /*size*/); +} + +MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { + if (Subtarget->isTargetKnownWindowsMSVC()) { + const MachineConstantPoolEntry &CPE = + MF->getConstantPool()->getConstants()[CPID]; + if (!CPE.isMachineConstantPoolEntry()) { + const DataLayout &DL = MF->getDataLayout(); + SectionKind Kind = CPE.getSectionKind(&DL); + const Constant *C = CPE.Val.ConstVal; + if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( + getObjFileLowering().getSectionForConstant(DL, Kind, C))) { + if (MCSymbol *Sym = S->getCOMDATSymbol()) { + if (Sym->isUndefined()) + OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global); + return Sym; + } + } + } + } + + return AsmPrinter::GetCPISymbol(CPID); +} + +void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { + const Triple &TT = TM.getTargetTriple(); + + if (TT.isOSBinFormatMachO()) { + // All darwin targets use mach-o. + MachineModuleInfoMachO &MMIMacho = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + + // Output stubs for dynamically-linked functions. + MachineModuleInfoMachO::SymbolListTy Stubs; + + Stubs = MMIMacho.GetFnStubList(); + if (!Stubs.empty()) { + MCSection *TheSection = OutContext.getMachOSection( + "__IMPORT", "__jump_table", + MachO::S_SYMBOL_STUBS | MachO::S_ATTR_SELF_MODIFYING_CODE | + MachO::S_ATTR_PURE_INSTRUCTIONS, + 5, SectionKind::getMetadata()); + OutStreamer->SwitchSection(TheSection); + + for (const auto &Stub : Stubs) { + // L_foo$stub: + OutStreamer->EmitLabel(Stub.first); + // .indirect_symbol _foo + OutStreamer->EmitSymbolAttribute(Stub.second.getPointer(), + MCSA_IndirectSymbol); + // hlt; hlt; hlt; hlt; hlt hlt = 0xf4. + const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4"; + OutStreamer->EmitBytes(StringRef(HltInsts, 5)); + } + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + // Output stubs for external and common global variables. + Stubs = MMIMacho.GetGVStubList(); + if (!Stubs.empty()) { + MCSection *TheSection = OutContext.getMachOSection( + "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer->SwitchSection(TheSection); + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + Stubs = MMIMacho.GetHiddenGVStubList(); + if (!Stubs.empty()) { + MCSection *TheSection = OutContext.getMachOSection( + "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer->SwitchSection(TheSection); + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + SM.serializeToStackMapSection(); + FM.serializeToFaultMapSection(); + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + } + + if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) { + StringRef SymbolName = + (TT.getArch() == Triple::x86_64) ? 
"_fltused" : "__fltused"; + MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName); + OutStreamer->EmitSymbolAttribute(S, MCSA_Global); + } + + if (TT.isOSBinFormatCOFF()) { + const TargetLoweringObjectFileCOFF &TLOFCOFF = + static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering()); + + std::string Flags; + raw_string_ostream FlagsOS(Flags); + + for (const auto &Function : M) + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function, *Mang); + for (const auto &Global : M.globals()) + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global, *Mang); + for (const auto &Alias : M.aliases()) + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias, *Mang); + + FlagsOS.flush(); + + // Output collected flags. + if (!Flags.empty()) { + OutStreamer->SwitchSection(TLOFCOFF.getDrectveSection()); + OutStreamer->EmitBytes(Flags); + } + + SM.serializeToStackMapSection(); + } + + if (TT.isOSBinFormatELF()) { + SM.serializeToStackMapSection(); + FM.serializeToFaultMapSection(); + } +} + +//===----------------------------------------------------------------------===// +// Target Registry Stuff +//===----------------------------------------------------------------------===// + +// Force static initialization. +extern "C" void LLVMInitializeX86AsmPrinter() { + RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target); + RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target); +} diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h new file mode 100644 index 0000000..9c8bd98 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -0,0 +1,131 @@ +//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H +#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H + +#include "X86Subtarget.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/FaultMaps.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/Target/TargetMachine.h" + +// Implemented in X86MCInstLower.cpp +namespace { + class X86MCInstLower; +} + +namespace llvm { +class MCStreamer; +class MCSymbol; + +class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { + const X86Subtarget *Subtarget; + StackMaps SM; + FaultMaps FM; + + // This utility class tracks the length of a stackmap instruction's 'shadow'. + // It is used by the X86AsmPrinter to ensure that the stackmap shadow + // invariants (i.e. no other stackmaps, patchpoints, or control flow within + // the shadow) are met, while outputting a minimal number of NOPs for padding. + // + // To minimise the number of NOPs used, the shadow tracker counts the number + // of instruction bytes output since the last stackmap. Only if there are too + // few instruction bytes to cover the shadow are NOPs used for padding. + class StackMapShadowTracker { + public: + StackMapShadowTracker(TargetMachine &TM); + ~StackMapShadowTracker(); + void startFunction(MachineFunction &MF); + void count(MCInst &Inst, const MCSubtargetInfo &STI); + + // Called to signal the start of a shadow of RequiredSize bytes. 
+ void reset(unsigned RequiredSize) { + RequiredShadowSize = RequiredSize; + CurrentShadowSize = 0; + InShadow = true; + } + + // Called before every stackmap/patchpoint, and at the end of basic blocks, + // to emit any necessary padding-NOPs. + void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI); + private: + TargetMachine &TM; + const MachineFunction *MF; + std::unique_ptr<MCCodeEmitter> CodeEmitter; + bool InShadow; + + // RequiredShadowSize holds the length of the shadow specified in the most + // recently encountered STACKMAP instruction. + // CurrentShadowSize counts the number of bytes encoded since the most + // recently encountered STACKMAP, stopping when that number is greater than + // or equal to RequiredShadowSize. + unsigned RequiredShadowSize, CurrentShadowSize; + }; + + StackMapShadowTracker SMShadowTracker; + + // All instructions emitted by the X86AsmPrinter should use this helper + // method. + // + // This helper function invokes the SMShadowTracker on each instruction before + // outputting it to the OutStream. This allows the shadow tracker to minimise + // the number of NOPs used for stackmap padding. + void EmitAndCountInstruction(MCInst &Inst); + void LowerSTACKMAP(const MachineInstr &MI); + void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); + void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); + void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL); + + void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI); + + public: + explicit X86AsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this), + SMShadowTracker(TM) {} + + const char *getPassName() const override { + return "X86 Assembly / Object Emitter"; + } + + const X86Subtarget &getSubtarget() const { return *Subtarget; } + + void EmitStartOfAsmFile(Module &M) override; + + void EmitEndOfAsmFile(Module &M) override; + + void EmitInstruction(const MachineInstr *MI) override; + + void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override { + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + } + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + + /// \brief Return the symbol for the specified constant pool entry. + MCSymbol *GetCPISymbol(unsigned CPID) const override; + + bool doInitialization(Module &M) override { + SMShadowTracker.reset(0); + SM.reset(); + return AsmPrinter::doInitialization(M); + } + + bool runOnMachineFunction(MachineFunction &F) override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp new file mode 100644 index 0000000..fc6ee17 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -0,0 +1,558 @@ +//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a pass that optimizes call sequences on x86. 
+// Currently, it converts movs of function parameters onto the stack into
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than an esp-relative mov
+// 2) It is possible to push memory arguments directly. So, if the
+//    transformation is performed pre-reg-alloc, it can help relieve
+//    register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+static cl::opt<bool>
+    NoX86CFOpt("no-x86-call-frame-opt",
+               cl::desc("Avoid optimizing x86 call frames for size"),
+               cl::init(false), cl::Hidden);
+
+namespace {
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  // Information we know about a particular call site
+  struct CallContext {
+    CallContext()
+        : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
+          MovVector(4, nullptr), NoStackParams(false), UsePush(false) {}
+
+    // Iterator referring to the frame setup instruction
+    MachineBasicBlock::iterator FrameSetup;
+
+    // Actual call instruction
+    MachineInstr *Call;
+
+    // A copy of the stack pointer
+    MachineInstr *SPCopy;
+
+    // The total displacement of all passed parameters
+    int64_t ExpectedDist;
+
+    // The sequence of movs used to pass the parameters
+    SmallVector<MachineInstr *, 4> MovVector;
+
+    // True if this call site has no stack parameters
+    bool NoStackParams;
+
+    // True if this callsite can use push instructions
+    bool UsePush;
+  };
+
+  typedef SmallVector<CallContext, 8> ContextVector;
+
+  bool isLegal(MachineFunction &MF);
+
+  bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap);
+
+  void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I, CallContext &Context);
+
+  bool adjustCallSequence(MachineFunction &MF, const CallContext &Context);
+
+  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+                                   unsigned Reg);
+
+  enum InstClassification { Convert, Skip, Exit };
+
+  InstClassification classifyInstruction(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         const X86RegisterInfo &RegInfo,
+                                         DenseSet<unsigned int> &UsedRegs);
+
+  const char *getPassName() const override { return "X86 Optimize Call Frame"; }
+
+  const TargetInstrInfo *TII;
+  const X86FrameLowering *TFL;
+  const X86Subtarget *STI;
+  const MachineRegisterInfo *MRI;
+  static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+  return new X86CallFrameOptimization();
+}
+
+// This checks whether the transformation is legal.
+// Also returns false in cases where it's potentially legal, but
+// we don't even want to try.
+bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
+  if (NoX86CFOpt.getValue())
+    return false;
+
+  // We currently only support call sequences where *all* parameters
+  // are passed on the stack.
+  // No point in running this in 64-bit mode, since some arguments are
+  // passed in-register in all common calling conventions, so the pattern
+  // we're looking for will never match.
+  if (STI->is64Bit())
+    return false;
+
+  // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
+  // in the compact unwind encoding that Darwin uses. So, bail if there
+  // is a danger of that being generated.
+  if (STI->isTargetDarwin() &&
+      (!MF.getMMI().getLandingPads().empty() ||
+       (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
+    return false;
+
+  // You would expect straight-line code between call-frame setup and
+  // call-frame destroy. You would be wrong. There are circumstances (e.g.
+  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+  // end up with the setup and the destroy in different basic blocks.
+  // This is bad, and breaks SP adjustment.
+  // So, check that all of the frames in the function are closed inside
+  // the same block, and, for good measure, that there are no nested frames.
+  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+  unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+  for (MachineBasicBlock &BB : MF) {
+    bool InsideFrameSequence = false;
+    for (MachineInstr &MI : BB) {
+      if (MI.getOpcode() == FrameSetupOpcode) {
+        if (InsideFrameSequence)
+          return false;
+        InsideFrameSequence = true;
+      } else if (MI.getOpcode() == FrameDestroyOpcode) {
+        if (!InsideFrameSequence)
+          return false;
+        InsideFrameSequence = false;
+      }
+    }
+
+    if (InsideFrameSequence)
+      return false;
+  }
+
+  return true;
+}
+
+// Check whether this transformation is profitable for a particular
+// function - in terms of code size.
+bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
+                                            ContextVector &CallSeqVector) {
+  // This transformation is always a win when we do not expect to have
+  // a reserved call frame. Under other circumstances, it may be either
+  // a win or a loss, and requires a heuristic.
+  bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
+  if (CannotReserveFrame)
+    return true;
+
+  // Don't do this when not optimizing for size.
+  if (!MF.getFunction()->optForSize())
+    return false;
+
+  unsigned StackAlign = TFL->getStackAlignment();
+
+  int64_t Advantage = 0;
+  for (auto CC : CallSeqVector) {
+    // Call sites where no parameters are passed on the stack
+    // do not affect the cost, since there needs to be no
+    // stack adjustment.
+    if (CC.NoStackParams)
+      continue;
+
+    if (!CC.UsePush) {
+      // If we don't use pushes for a particular call site,
+      // we pay for not having a reserved call frame with an
+      // additional sub/add esp pair. The cost is ~3 bytes per instruction,
+      // depending on the size of the constant.
+      // TODO: Callee-pop functions should have a smaller penalty, because
+      // an add is needed even with a reserved call frame.
+      Advantage -= 6;
+    } else {
+      // We can use pushes. First, account for the fixed costs.
+      // We'll need an add after the call.
+      Advantage -= 3;
+      // If we have to realign the stack, we'll also need a sub before it.
+      if (CC.ExpectedDist % StackAlign)
+        Advantage -= 3;
+      // Now, for each push, we save ~3 bytes. For small constants, we actually
+      // save more (up to 5 bytes), but 3 should be a good approximation.
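      // For example, a call site with four 32-bit stack arguments
      // (ExpectedDist == 16) and no realignment penalty nets
      // -3 + (16 / 4) * 3 = +9 bytes in favor of using pushes.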
+ Advantage += (CC.ExpectedDist / 4) * 3; + } + } + + return (Advantage >= 0); +} + +bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { + STI = &MF.getSubtarget<X86Subtarget>(); + TII = STI->getInstrInfo(); + TFL = STI->getFrameLowering(); + MRI = &MF.getRegInfo(); + + if (!isLegal(MF)) + return false; + + unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + + bool Changed = false; + + ContextVector CallSeqVector; + + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) + if (I->getOpcode() == FrameSetupOpcode) { + CallContext Context; + collectCallInfo(MF, *BB, I, Context); + CallSeqVector.push_back(Context); + } + + if (!isProfitable(MF, CallSeqVector)) + return false; + + for (auto CC : CallSeqVector) + if (CC.UsePush) + Changed |= adjustCallSequence(MF, CC); + + return Changed; +} + +X86CallFrameOptimization::InstClassification +X86CallFrameOptimization::classifyInstruction( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) { + if (MI == MBB.end()) + return Exit; + + // The instructions we actually care about are movs onto the stack + int Opcode = MI->getOpcode(); + if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr) + return Convert; + + // Not all calling conventions have only stack MOVs between the stack + // adjust and the call. + + // We want to tolerate other instructions, to cover more cases. + // In particular: + // a) PCrel calls, where we expect an additional COPY of the basereg. + // b) Passing frame-index addresses. + // c) Calling conventions that have inreg parameters. These generate + // both copies and movs into registers. + // To avoid creating lots of special cases, allow any instruction + // that does not write into memory, does not def or use the stack + // pointer, and does not def any register that was used by a preceding + // push. + // (Reading from memory is allowed, even if referenced through a + // frame index, since these will get adjusted properly in PEI) + + // The reason for the last condition is that the pushes can't replace + // the movs in place, because the order must be reversed. + // So if we have a MOV32mr that uses EDX, then an instruction that defs + // EDX, and then the call, after the transformation the push will use + // the modified version of EDX, and not the original one. + // Since we are still in SSA form at this point, we only need to + // make sure we don't clobber any *physical* registers that were + // used by an earlier mov that will become a push. + + if (MI->isCall() || MI->mayStore()) + return Exit; + + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + unsigned int Reg = MO.getReg(); + if (!RegInfo.isPhysicalRegister(Reg)) + continue; + if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister())) + return Exit; + if (MO.isDef()) { + for (unsigned int U : UsedRegs) + if (RegInfo.regsOverlap(Reg, U)) + return Exit; + } + } + + return Skip; +} + +void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + CallContext &Context) { + // Check that this particular call sequence is amenable to the + // transformation. 
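  // For example (schematically), a transformable sequence looks like:
  //   movl $42, (%esp)
  //   movl %eax, 4(%esp)
  //   calll foo
  // and adjustCallSequence later rewrites it, walking the movs in reverse
  // order, as:
  //   pushl %eax
  //   pushl $42
  //   calll foo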
+ const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( + STI->getRegisterInfo()); + unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + + // We expect to enter this at the beginning of a call sequence + assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); + MachineBasicBlock::iterator FrameSetup = I++; + Context.FrameSetup = FrameSetup; + + // How much do we adjust the stack? This puts an upper bound on + // the number of parameters actually passed on it. + unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; + + // A zero adjustment means no stack parameters + if (!MaxAdjust) { + Context.NoStackParams = true; + return; + } + + // For globals in PIC mode, we can have some LEAs here. + // Ignore them, they don't bother us. + // TODO: Extend this to something that covers more cases. + while (I->getOpcode() == X86::LEA32r) + ++I; + + // We expect a copy instruction here. + // TODO: The copy instruction is a lowering artifact. + // We should also support a copy-less version, where the stack + // pointer is used directly. + if (!I->isCopy() || !I->getOperand(0).isReg()) + return; + Context.SPCopy = I++; + + unsigned StackPtr = Context.SPCopy->getOperand(0).getReg(); + + // Scan the call setup sequence for the pattern we're looking for. + // We only handle a simple case - a sequence of MOV32mi or MOV32mr + // instructions, that push a sequence of 32-bit values onto the stack, with + // no gaps between them. + if (MaxAdjust > 4) + Context.MovVector.resize(MaxAdjust, nullptr); + + InstClassification Classification; + DenseSet<unsigned int> UsedRegs; + + while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) != + Exit) { + if (Classification == Skip) { + ++I; + continue; + } + + // We know the instruction is a MOV32mi/MOV32mr. + // We only want movs of the form: + // movl imm/r32, k(%esp) + // If we run into something else, bail. + // Note that AddrBaseReg may, counter to its name, not be a register, + // but rather a frame index. + // TODO: Support the fi case. This should probably work now that we + // have the infrastructure to track the stack pointer within a call + // sequence. + if (!I->getOperand(X86::AddrBaseReg).isReg() || + (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || + !I->getOperand(X86::AddrScaleAmt).isImm() || + (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || + (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || + (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || + !I->getOperand(X86::AddrDisp).isImm()) + return; + + int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); + assert(StackDisp >= 0 && + "Negative stack displacement when passing parameters"); + + // We really don't want to consider the unaligned case. + if (StackDisp % 4) + return; + StackDisp /= 4; + + assert((size_t)StackDisp < Context.MovVector.size() && + "Function call has more parameters than the stack is adjusted for."); + + // If the same stack slot is being filled twice, something's fishy. + if (Context.MovVector[StackDisp] != nullptr) + return; + Context.MovVector[StackDisp] = I; + + for (const MachineOperand &MO : I->uses()) { + if (!MO.isReg()) + continue; + unsigned int Reg = MO.getReg(); + if (RegInfo.isPhysicalRegister(Reg)) + UsedRegs.insert(Reg); + } + + ++I; + } + + // We now expect the end of the sequence. If we stopped early, + // or reached the end of the block without finding a call, bail. 
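  // Note also that gaps are rejected below: for example, if 12 bytes were
  // adjusted but only (%esp) and 8(%esp) were written, the unwritten slot
  // at 4(%esp) causes the call site to be left untouched.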
+ if (I == MBB.end() || !I->isCall()) + return; + + Context.Call = I; + if ((++I)->getOpcode() != FrameDestroyOpcode) + return; + + // Now, go through the vector, and see that we don't have any gaps, + // but only a series of 32-bit MOVs. + auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end(); + for (; MMI != MME; ++MMI, Context.ExpectedDist += 4) + if (*MMI == nullptr) + break; + + // If the call had no parameters, do nothing + if (MMI == Context.MovVector.begin()) + return; + + // We are either at the last parameter, or a gap. + // Make sure it's not a gap + for (; MMI != MME; ++MMI) + if (*MMI != nullptr) + return; + + Context.UsePush = true; + return; +} + +bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, + const CallContext &Context) { + // Ok, we can in fact do the transformation for this call. + // Do not remove the FrameSetup instruction, but adjust the parameters. + // PEI will end up finalizing the handling of this. + MachineBasicBlock::iterator FrameSetup = Context.FrameSetup; + MachineBasicBlock &MBB = *(FrameSetup->getParent()); + FrameSetup->getOperand(1).setImm(Context.ExpectedDist); + + DebugLoc DL = FrameSetup->getDebugLoc(); + // Now, iterate through the vector in reverse order, and replace the movs + // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to + // replace uses. + for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) { + MachineBasicBlock::iterator MOV = *Context.MovVector[Idx]; + MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + MachineBasicBlock::iterator Push = nullptr; + if (MOV->getOpcode() == X86::MOV32mi) { + unsigned PushOpcode = X86::PUSHi32; + // If the operand is a small (8-bit) immediate, we can use a + // PUSH instruction with a shorter encoding. + // Note that isImm() may fail even though this is a MOVmi, because + // the operand can also be a symbol. + if (PushOp.isImm()) { + int64_t Val = PushOp.getImm(); + if (isInt<8>(Val)) + PushOpcode = X86::PUSH32i8; + } + Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)) + .addOperand(PushOp); + } else { + unsigned int Reg = PushOp.getReg(); + + // If PUSHrmm is not slow on this target, try to fold the source of the + // push into the instruction. + bool SlowPUSHrmm = STI->isAtom() || STI->isSLM(); + + // Check that this is legal to fold. Right now, we're extremely + // conservative about that. + MachineInstr *DefMov = nullptr; + if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); + + unsigned NumOps = DefMov->getDesc().getNumOperands(); + for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) + Push->addOperand(DefMov->getOperand(i)); + + DefMov->eraseFromParent(); + } else { + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) + .addReg(Reg) + .getInstr(); + } + } + + // For debugging, when using SP-based CFA, we need to adjust the CFA + // offset after each push. + // TODO: This is needed only if we require precise CFA. + if (!TFL->hasFP(MF)) + TFL->BuildCFI(MBB, std::next(Push), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, 4)); + + MBB.erase(MOV); + } + + // The stack-pointer copy is no longer used in the call sequences. + // There should not be any other users, but we can't commit to that, so: + if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg())) + Context.SPCopy->eraseFromParent(); + + // Once we've done this, we need to make sure PEI doesn't assume a reserved + // frame. 
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + FuncInfo->setHasPushSequences(true); + + return true; +} + +MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( + MachineBasicBlock::iterator FrameSetup, unsigned Reg) { + // Do an extremely restricted form of load folding. + // ISel will often create patterns like: + // movl 4(%edi), %eax + // movl 8(%edi), %ecx + // movl 12(%edi), %edx + // movl %edx, 8(%esp) + // movl %ecx, 4(%esp) + // movl %eax, (%esp) + // call + // Get rid of those with prejudice. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return nullptr; + + // Make sure this is the only use of Reg. + if (!MRI->hasOneNonDBGUse(Reg)) + return nullptr; + + MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg); + + // Make sure the def is a MOV from memory. + // If the def is an another block, give up. + if (DefMI->getOpcode() != X86::MOV32rm || + DefMI->getParent() != FrameSetup->getParent()) + return nullptr; + + // Make sure we don't have any instructions between DefMI and the + // push that make folding the load illegal. + for (auto I = DefMI; I != FrameSetup; ++I) + if (I->isLoadFoldBarrier()) + return nullptr; + + return DefMI; +} diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h new file mode 100644 index 0000000..a08160f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -0,0 +1,107 @@ +//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the X86 Calling Convention that +// aren't done by tablegen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H +#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/IR/CallingConv.h" + +namespace llvm { + +inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // Similar to CCPassIndirect, with the addition of inreg. + LocVT = MVT::i32; + LocInfo = CCValAssign::Indirect; + ArgFlags.setInReg(); + return false; // Continue the search, but now for i32. +} + + +inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, + CCValAssign::LocInfo &, ISD::ArgFlagsTy &, + CCState &) { + llvm_unreachable("The AnyReg calling convention is only supported by the " \ + "stackmap and patchpoint intrinsics."); + // gracefully fallback to X86 C calling convention on Release builds. + return false; +} + +inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure + // not to split i64 and double between a register and stack + static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; + static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // If this is the first part of an double/i64/i128, or if we're already + // in the middle of a split, add to the pending list. 
If this is not + // the end of the split, return, otherwise go on to process the pending + // list + if (ArgFlags.isSplit() || !PendingMembers.empty()) { + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + if (!ArgFlags.isSplitEnd()) + return true; + } + + // If there are no pending members, we are not in the middle of a split, + // so do the usual inreg stuff. + if (PendingMembers.empty()) { + if (unsigned Reg = State.AllocateReg(RegList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; + } + + assert(ArgFlags.isSplitEnd()); + + // We now have the entire original argument in PendingMembers, so decide + // whether to use registers or the stack. + // Per the MCU ABI: + // a) To use registers, we need to have enough of them free to contain + // the entire argument. + // b) We never want to use more than 2 registers for a single argument. + + unsigned FirstFree = State.getFirstUnallocated(RegList); + bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); + + for (auto &It : PendingMembers) { + if (UseRegs) + It.convertToReg(State.AllocateReg(RegList[FirstFree++])); + else + It.convertToMem(State.AllocateStack(4, 4)); + State.addLoc(It); + } + + PendingMembers.clear(); + + return true; +} + +} // End llvm namespace + +#endif + diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td new file mode 100644 index 0000000..54d88cb --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -0,0 +1,881 @@ +//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the X86-32 and X86-64 +// architectures. +// +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. +class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("static_cast<const X86Subtarget&>" + "(State.getMachineFunction().getSubtarget()).", F), + A>; + +//===----------------------------------------------------------------------===// +// Return Value Calling Conventions +//===----------------------------------------------------------------------===// + +// Return-value conventions common to all X86 CC's. +def RetCC_X86Common : CallingConv<[ + // Scalar values are returned in AX first, then DX. For i8, the ABI + // requires the values to be in AL and AH, however this code uses AL and DL + // instead. This is because using AH for the second register conflicts with + // the way LLVM does multiple return values -- a return of {i16,i8} would end + // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI + // for functions that return two i8 values are currently expected to pack the + // values into an i16 (which uses AX, and thus AL:AH). + // + // For code that doesn't care about the ABI, we allow returning more than two + // integer values in registers. 
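// For example, a function returning three i32 values (beyond what the ABI
// itself allows) uses EAX, EDX and ECX, per the assignments below.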
+ CCIfType<[i1], CCPromoteToType<i8>>, + CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, + CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>, + + // Boolean vectors of AVX-512 are returned in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + + // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 + // can only be used by ABI non-compliant code. If the target doesn't have XMM + // registers, it won't have vector types. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX target feature. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX-512 target feature. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // MMX vector types are always returned in MM0. If the target doesn't have + // MM0, it doesn't support these vector types. + CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, + + // Long double types are always returned in FP0 (even with SSE). + CCIfType<[f80], CCAssignToReg<[FP0, FP1]>> +]>; + +// X86-32 C return-value convention. +def RetCC_X86_32_C : CallingConv<[ + // The X86-32 calling convention returns FP values in FP0, unless marked + // with "inreg" (used here to distinguish one kind of reg from another, + // weirdly; this is really the sse-regparm calling convention) in which + // case they use XMM0, otherwise it is the same as the common X86 calling + // conv. + CCIfInReg<CCIfSubtarget<"hasSSE2()", + CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, + CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-32 FastCC return-value convention. +def RetCC_X86_32_Fast : CallingConv<[ + // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has + // SSE2. + // This can happen when a float, 2 x float, or 3 x float vector is split by + // target lowering, and is returned in 1-3 sse regs. + CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + + // For integers, ECX can be used as an extra return register + CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, + + // Otherwise, it is the same as the common X86 calling convention. + CCDelegateTo<RetCC_X86Common> +]>; + +// Intel_OCL_BI return-value convention. +def RetCC_Intel_OCL_BI : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. 
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + // No more than 4 registers + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit FP vectors + CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // i32, i64 in the standard way + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-32 HiPE return-value convention. +def RetCC_X86_32_HiPE : CallingConv<[ + // Promote all types to i32 + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Return: HP, P, VAL1, VAL2 + CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>> +]>; + +// X86-32 HiPE return-value convention. +def RetCC_X86_32_VectorCall : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit FP vectors + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // Return integers in the standard way. + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-64 C return-value convention. +def RetCC_X86_64_C : CallingConv<[ + // The X86-64 calling convention always returns FP values in XMM0. + CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, + + // MMX vector types are always returned in XMM0. + CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-Win64 C return-value convention. +def RetCC_X86_Win64_C : CallingConv<[ + // The X86-Win64 calling convention always returns __m64 values in RAX. + CCIfType<[x86mmx], CCBitConvertToType<i64>>, + + // Otherwise, everything is the same as 'normal' X86-64 C CC. + CCDelegateTo<RetCC_X86_64_C> +]>; + +// X86-64 HiPE return-value convention. +def RetCC_X86_64_HiPE : CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: HP, P, VAL1, VAL2 + CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>> +]>; + +// X86-64 WebKit_JS return-value convention. +def RetCC_X86_64_WebKit_JS : CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: RAX + CCIfType<[i64], CCAssignToReg<[RAX]>> +]>; + +// X86-64 AnyReg return-value convention. No explicit register is specified for +// the return-value. The register allocator is allowed and expected to choose +// any free register. +// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the X86 C calling convention. +def RetCC_X86_64_AnyReg : CallingConv<[ + CCCustom<"CC_X86_AnyReg_Error"> +]>; + +// X86-64 HHVM return-value convention. +def RetCC_X86_64_HHVM: CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: could return in any GP register save RSP and R12. + CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14, R15]>> +]>; + +// This is the root return-value convention for the X86-32 backend. +def RetCC_X86_32 : CallingConv<[ + // If FastCC, use RetCC_X86_32_Fast. + CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, + // If HiPE, use RetCC_X86_32_HiPE. 
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>, + + // Otherwise, use RetCC_X86_32_C. + CCDelegateTo<RetCC_X86_32_C> +]>; + +// This is the root return-value convention for the X86-64 backend. +def RetCC_X86_64 : CallingConv<[ + // HiPE uses RetCC_X86_64_HiPE + CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>, + + // Handle JavaScript calls. + CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>, + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>, + + // Handle explicit CC selection + CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + + // Handle HHVM calls. + CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>, + + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<RetCC_X86_64_C> +]>; + +// This is the return-value convention used for the entire X86 backend. +def RetCC_X86 : CallingConv<[ + + // Check if this is the Intel OpenCL built-ins calling convention + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>, + + CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>, + CCDelegateTo<RetCC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// X86-64 Argument Calling Conventions +//===----------------------------------------------------------------------===// + +def CC_X86_64_C : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<8, 8>>, + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>, + CCIfNest<CCAssignToReg<[R10]>>, + + // The first 6 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>, + CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, + + // The first 8 MMX vector arguments are passed in XMM registers on Darwin. + CCIfType<[x86mmx], + CCIfSubtarget<"isTargetDarwin()", + CCIfSubtarget<"hasSSE2()", + CCPromoteToType<v2i64>>>>, + + // Boolean vectors of AVX-512 are passed in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + + // The first 8 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, + + // The first 8 256-bit vector arguments are passed in YMM registers, unless + // this is a vararg function. + // FIXME: This isn't precisely correct; the x86-64 ABI document says that + // fixed arguments to vararg functions are supposed to be passed in + // registers. Actually modeling that would be a lot of work, though. 
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasFp256()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, + YMM4, YMM5, YMM6, YMM7]>>>>, + + // The first 8 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Long doubles get stack slots whose size and alignment depends on the + // subtarget. + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> +]>; + +// Calling convention for X86-64 HHVM. +def CC_X86_64_HHVM : CallingConv<[ + // Use all/any GP registers for args, except RSP. + CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15, + RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14]>> +]>; + +// Calling convention for helper functions in HHVM. +def CC_X86_64_HHVM_C : CallingConv<[ + // Pass the first argument in RBP. + CCIfType<[i64], CCAssignToReg<[RBP]>>, + + // Otherwise it's the same as the regular C calling convention. + CCDelegateTo<CC_X86_64_C> +]>; + +// Calling convention used on Win64 +def CC_X86_Win64_C : CallingConv<[ + // FIXME: Handle byval stuff. + // FIXME: Handle varargs. + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCAssignToReg<[R10]>>, + + // 128 bit vectors are passed by pointer + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>, + + + // 256 bit vectors are passed by pointer + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, + + // 512 bit vectors are passed by pointer + CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, + + // The first 4 MMX vector arguments are passed in GPRs. + CCIfType<[x86mmx], CCBitConvertToType<i64>>, + + // The first 4 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // Do not pass the sret argument in RCX, the Win64 thiscall calling + // convention requires "this" to be passed in RCX. + CCIfCC<"CallingConv::X86_ThisCall", + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ], + [XMM1, XMM2, XMM3]>>>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // The first 4 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], + [RCX , RDX , R8 , R9 ]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Long doubles get stack slots whose size and alignment depends on the + // subtarget. 
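// For example, under this Win64 convention f(int, double) takes the int in
// ECX and the double in XMM1: each of the first four parameter positions
// shadows both a GPR and an XMM register, so the second position uses
// XMM1/RDX rather than XMM0.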
+ CCIfType<[f80], CCAssignToStack<0, 0>> +]>; + +def CC_X86_Win64_VectorCall : CallingConv<[ + // The first 6 floating point and vector types of 128 bits or less use + // XMM0-XMM5. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, + + // 256-bit vectors use YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, + + // 512-bit vectors use ZMM registers. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_Win64_C> +]>; + + +def CC_X86_64_GHC : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim + CCIfType<[i64], + CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>, + + // Pass in STG registers: F1, F2, F3, F4, D1, D2 + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", + CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>> +]>; + +def CC_X86_64_HiPE : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3 + CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> +]>; + +def CC_X86_64_WebKit_JS : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Only the first integer argument is passed in register. + CCIfType<[i32], CCAssignToReg<[EAX]>>, + CCIfType<[i64], CCAssignToReg<[RAX]>>, + + // The remaining integer arguments are passed on the stack. 32bit integer and + // floating-point arguments are aligned to 4 byte and stored in 4 byte slots. + // 64bit integer and floating-point arguments are aligned to 8 byte and stored + // in 8 byte stack slots. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +// No explicit register is specified for the AnyReg calling convention. The +// register allocator may assign the arguments to any free register. +// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the X86 C calling convention. +def CC_X86_64_AnyReg : CallingConv<[ + CCCustom<"CC_X86_AnyReg_Error"> +]>; + +//===----------------------------------------------------------------------===// +// X86 C Calling Convention +//===----------------------------------------------------------------------===// + +/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector +/// values are spilled on the stack. +def CC_X86_32_Vector_Common : CallingConv<[ + // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. 
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> +]>; + +// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in +// vector registers +def CC_X86_32_Vector_Standard : CallingConv<[ + // SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasFp256()", + CCAssignToReg<[YMM0, YMM1, YMM2]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + +// CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in +// vector registers. +def CC_X86_32_Vector_Darwin : CallingConv<[ + // SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasFp256()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + +/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP +/// values are spilled on the stack. +def CC_X86_32_Common : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + // The first 3 float or double arguments, if marked 'inreg' and if the call + // is not a vararg call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64], + CCIfSubtarget<"hasSSE2()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>, + + // The first 3 __m64 vector arguments are passed in mmx registers if the + // call is not a vararg call. + CCIfNotVarArg<CCIfType<[x86mmx], + CCAssignToReg<[MM0, MM1, MM2]>>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Doubles get 8-byte slots that are 4-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 4>>, + + // Long doubles get slots whose size depends on the subtarget. + CCIfType<[f80], CCAssignToStack<0, 4>>, + + // Boolean vectors of AVX-512 are passed in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + + // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are + // passed in the parameter area. + CCIfType<[x86mmx], CCAssignToStack<8, 4>>, + + // Darwin passes vectors in a form that differs from the i386 psABI + CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>, + + // Otherwise, drop to 'normal' X86-32 CC + CCDelegateTo<CC_X86_32_Vector_Standard> +]>; + +def CC_X86_32_C : CallingConv<[ + // Promote i1/i8/i16 arguments to i32. 
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in ECX. + CCIfNest<CCAssignToReg<[ECX]>>, + + // The first 3 integer arguments, if marked 'inreg' and if the call is not + // a vararg call, are passed in integer registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_MCU : CallingConv<[ + // Handles byval parameters. Note that, like FastCC, we can't rely on + // the delegation to CC_X86_32_Common because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // If the call is not a vararg call, some arguments may be passed + // in integer registers. + CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_FastCall : CallingConv<[ + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_VectorCall : CallingConv<[ + // The first 6 floating point and vector types of 128 bits or less use + // XMM0-XMM5. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, + + // 256-bit vectors use YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, + + // 512-bit vectors use ZMM registers. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + + // Otherwise, pass it indirectly. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, + v32i8, v16i16, v8i32, v4i64, v8f32, v4f64, + v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCCustom<"CC_X86_32_VectorCallIndirect">>, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_32_FastCall> +]>; + +def CC_X86_32_ThisCall_Common : CallingConv<[ + // The first integer argument is passed in ECX + CCIfType<[i32], CCAssignToReg<[ECX]>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_ThisCall_Mingw : CallingConv<[ + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + CCDelegateTo<CC_X86_32_ThisCall_Common> +]>; + +def CC_X86_32_ThisCall_Win : CallingConv<[ + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // Pass sret arguments indirectly through stack. + CCIfSRet<CCAssignToStack<4, 4>>, + + CCDelegateTo<CC_X86_32_ThisCall_Common> +]>; + +def CC_X86_32_ThisCall : CallingConv<[ + CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>, + CCDelegateTo<CC_X86_32_ThisCall_Win> +]>; + +def CC_X86_32_FastCC : CallingConv<[ + // Handles byval parameters. Note that we can't rely on the delegation + // to CC_X86_32_Common for this because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16 arguments to i32. 
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, + + // The first 3 float or double arguments, if the call is not a vararg + // call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfType<[f32,f64], + CCIfSubtarget<"hasSSE2()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, + + // Doubles get 8-byte slots that are 8-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 8>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_GHC : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in STG registers: Base, Sp, Hp, R1 + CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>> +]>; + +def CC_X86_32_HiPE : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2 + CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>> +]>; + +// X86-64 Intel OpenCL built-ins calling convention. +def CC_Intel_OCL_BI : CallingConv<[ + + CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>, + CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>, + + CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>, + CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>, + + CCIfType<[i32], CCAssignToStack<4, 4>>, + + // The SSE vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // The 256-bit vector arguments are passed in YMM registers. + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + + // The 512-bit vector arguments are passed in ZMM registers. + CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>, + + // Pass masks in mask registers + CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>, + + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, + CCDelegateTo<CC_X86_32_C> +]>; + +def CC_X86_32_Intr : CallingConv<[ + CCAssignToStack<4, 4> +]>; + +def CC_X86_64_Intr : CallingConv<[ + CCAssignToStack<8, 8> +]>; + +//===----------------------------------------------------------------------===// +// X86 Root Argument Calling Conventions +//===----------------------------------------------------------------------===// + +// This is the root argument convention for the X86-32 backend. 
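+// For illustration (not part of the tables themselves): a plain cdecl
+// function on a 32-bit target falls through to the final
+// CCDelegateTo<CC_X86_32_C> entry below, while a prototype such as
+//   int __attribute__((fastcall)) f(int a, int b);   // a in ECX, b in EDX
+// is routed through CC_X86_32_FastCall by the CallingConv::X86_FastCall line.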
+def CC_X86_32 : CallingConv<[ + CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, + CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, + CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, + CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, + + // Otherwise, drop to normal X86-32 CC + CCDelegateTo<CC_X86_32_C> +]>; + +// This is the root argument convention for the X86-64 backend. +def CC_X86_64 : CallingConv<[ + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, + CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>, + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>, + CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, + CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, + CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, + + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<CC_X86_64_C> +]>; + +// This is the argument convention used for the entire X86 backend. +def CC_X86 : CallingConv<[ + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, + CCDelegateTo<CC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// Callee-saved Registers. +//===----------------------------------------------------------------------===// + +def CSR_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; +def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>; + +def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>; +def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; + +def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, + (sequence "XMM%u", 6, 15))>; + +// The function used by Darwin to obtain the address of a thread-local variable +// uses rdi to pass a single parameter and rax for the return value. All other +// GPRs are preserved. 
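+// For illustration, the Darwin TLS access sequence this models looks roughly
+// like
+//   movq  _var@TLVP(%rip), %rdi
+//   callq *(%rdi)           ## address of the variable comes back in %rax
+// so only RDI and RAX need to be treated as clobbered, which is why they are
+// the GPRs left out of the register list below.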
+def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, + R8, R9, R10, R11)>; + +// All GPRs - except r11 +def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, + R8, R9, R10, RSP)>; + +// All registers - except r11 +def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs, + (sequence "XMM%u", 0, 15))>; +def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs, + (sequence "YMM%u", 0, 15))>; + +def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, + R11, R12, R13, R14, R15, RBP, + (sequence "XMM%u", 0, 15))>; + +def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI, + EDI, ESP)>; +def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs, + (sequence "XMM%u", 0, 7))>; + +def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP, + (sequence "XMM%u", 16, 31))>; +def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP, + (sequence "YMM%u", 0, 31)), + (sequence "XMM%u", 0, 15))>; + +// Standard C + YMM6-15 +def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, + R13, R14, R15, + (sequence "YMM%u", 6, 15))>; + +def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, + R12, R13, R14, R15, + (sequence "ZMM%u", 6, 21), + K4, K5, K6, K7)>; +//Standard C + XMM 8-15 +def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, + (sequence "XMM%u", 8, 15))>; + +//Standard C + YMM 8-15 +def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, + (sequence "YMM%u", 8, 15))>; + +def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, + (sequence "ZMM%u", 16, 31), + K4, K5, K6, K7)>; + +// Only R12 is preserved for PHP calls in HHVM. +def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp new file mode 100644 index 0000000..a09d065 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -0,0 +1,198 @@ +//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands pseudo instructions into target +// instructions to allow proper scheduling, if-conversion, other late +// optimizations, or simply the encoding of the instructions. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. 
+#include "llvm/IR/GlobalValue.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-pseudo" + +namespace { +class X86ExpandPseudo : public MachineFunctionPass { +public: + static char ID; + X86ExpandPseudo() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const X86Subtarget *STI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + const X86FrameLowering *X86FL; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "X86 pseudo instruction expansion pass"; + } + +private: + bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool ExpandMBB(MachineBasicBlock &MBB); +}; +char X86ExpandPseudo::ID = 0; +} // End anonymous namespace. + +/// If \p MBBI is a pseudo instruction, this method expands +/// it to the corresponding (sequence of) actual instruction(s). +/// \returns true if \p MBBI has been expanded. +bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + DebugLoc DL = MBBI->getDebugLoc(); + switch (Opcode) { + default: + return false; + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: { + bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; + MachineOperand &JumpTarget = MBBI->getOperand(0); + MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + + // Adjust stack pointer. + int StackAdj = StackAdjust.getImm(); + + if (StackAdj) { + // Check for possible merge with preceding ADD instruction. + StackAdj += X86FL->mergeSPUpdates(MBB, MBBI, true); + X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true); + } + + // Jump to label or value in register. + bool IsWin64 = STI->isTargetWin64(); + if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) { + unsigned Op = (Opcode == X86::TCRETURNdi) + ? X86::TAILJMPd + : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) { + unsigned Op = (Opcode == X86::TCRETURNmi) + ? X86::TAILJMPm + : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); + for (unsigned i = 0; i != 5; ++i) + MIB.addOperand(MBBI->getOperand(i)); + } else if (Opcode == X86::TCRETURNri64) { + BuildMI(MBB, MBBI, DL, + TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } else { + BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = std::prev(MBBI); + NewMI->copyImplicitOps(*MBBI->getParent()->getParent(), MBBI); + + // Delete the pseudo instruction TCRETURN. 
+ MBB.erase(MBBI); + + return true; + } + case X86::EH_RETURN: + case X86::EH_RETURN64: { + MachineOperand &DestAddr = MBBI->getOperand(0); + assert(DestAddr.isReg() && "Offset should be in register!"); + const bool Uses64BitFramePtr = + STI->isTarget64BitLP64() || STI->isTargetNaCl64(); + unsigned StackPtr = TRI->getStackRegister(); + BuildMI(MBB, MBBI, DL, + TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr) + .addReg(DestAddr.getReg()); + // The EH_RETURN pseudo is really removed during the MC Lowering. + return true; + } + case X86::IRET: { + // Adjust stack to erase error code + int64_t StackAdj = MBBI->getOperand(0).getImm(); + X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true); + // Replace pseudo with machine iret + BuildMI(MBB, MBBI, DL, + TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32)); + MBB.erase(MBBI); + return true; + } + case X86::EH_RESTORE: { + // Restore ESP and EBP, and optionally ESI if required. + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality( + MBB.getParent()->getFunction()->getPersonalityFn())); + X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH); + MBBI->eraseFromParent(); + return true; + } + } + llvm_unreachable("Previous switch has a fallthrough?"); +} + +/// Expand all pseudo instructions contained in \p MBB. +/// \returns true if any expansion occurred for \p MBB. +bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + // MBBI may be invalidated by the expansion. + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= ExpandMI(MBB, MBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + STI = &static_cast<const X86Subtarget &>(MF.getSubtarget()); + TII = STI->getInstrInfo(); + TRI = STI->getRegisterInfo(); + X86FL = STI->getFrameLowering(); + + bool Modified = false; + for (MachineBasicBlock &MBB : MF) + Modified |= ExpandMBB(MBB); + return Modified; +} + +/// Returns an instance of the pseudo instruction expansion pass. +FunctionPass *llvm::createX86ExpandPseudoPass() { + return new X86ExpandPseudo(); +} diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp new file mode 100644 index 0000000..629d4d3 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -0,0 +1,3607 @@ +//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86-specific support for the FastISel class. Much +// of the target-specific code is generated by tablegen in the file +// X86GenFastISel.inc, which is #included here. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86CallingConv.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +namespace { + +class X86FastISel final : public FastISel { + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 + /// floating point ops. + /// When SSE is available, use it for f32 operations. + /// When SSE2 is available, use it for f64 operations. + bool X86ScalarSSEf64; + bool X86ScalarSSEf32; + +public: + explicit X86FastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) + : FastISel(funcInfo, libInfo) { + Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>(); + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); + } + + bool fastSelectInstruction(const Instruction *I) override; + + /// \brief The specified machine instr operand is a vreg, and that + /// vreg is being provided by the specified load instruction. If possible, + /// try to fold the load as an operand to the instruction, returning true if + /// possible. 
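+  ///
+  /// For illustration (a hypothetical MIR sequence), a load such as
+  ///   %vreg1 = MOV32rm <fi#0>, 1, %noreg, 0, %noreg
+  ///   %vreg2 = ADD32rr %vreg0, %vreg1
+  /// can, when %vreg1 has no other use, be folded into
+  ///   %vreg2 = ADD32rm %vreg0, <fi#0>, 1, %noreg, 0, %noreg
+  /// so that the separate load disappears.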
+ bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) override; + + bool fastLowerArguments() override; + bool fastLowerCall(CallLoweringInfo &CLI) override; + bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; + +#include "X86GenFastISel.inc" + +private: + bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL); + + bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO, + unsigned &ResultReg, unsigned Alignment = 1); + + bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); + + bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, + unsigned &ResultReg); + + bool X86SelectAddress(const Value *V, X86AddressMode &AM); + bool X86SelectCallAddress(const Value *V, X86AddressMode &AM); + + bool X86SelectLoad(const Instruction *I); + + bool X86SelectStore(const Instruction *I); + + bool X86SelectRet(const Instruction *I); + + bool X86SelectCmp(const Instruction *I); + + bool X86SelectZExt(const Instruction *I); + + bool X86SelectBranch(const Instruction *I); + + bool X86SelectShift(const Instruction *I); + + bool X86SelectDivRem(const Instruction *I); + + bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); + + bool X86SelectSelect(const Instruction *I); + + bool X86SelectTrunc(const Instruction *I); + + bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc, + const TargetRegisterClass *RC); + + bool X86SelectFPExt(const Instruction *I); + bool X86SelectFPTrunc(const Instruction *I); + bool X86SelectSIToFP(const Instruction *I); + + const X86InstrInfo *getInstrInfo() const { + return Subtarget->getInstrInfo(); + } + const X86TargetMachine *getTargetMachine() const { + return static_cast<const X86TargetMachine *>(&TM); + } + + bool handleConstantAddresses(const Value *V, X86AddressMode &AM); + + unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT); + unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT); + unsigned fastMaterializeConstant(const Constant *C) override; + + unsigned fastMaterializeAlloca(const AllocaInst *C) override; + + unsigned fastMaterializeFloatZero(const ConstantFP *CF) override; + + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is + /// computed in an SSE register, not on the X87 floating point stack. + bool isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + } + + bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); + + bool IsMemcpySmall(uint64_t Len); + + bool TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len); + + bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond); + + const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB, + X86AddressMode &AM); +}; + +} // end anonymous namespace. 
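+
+// For illustration (hypothetical IR), getX86ConditionCode below maps
+//   %c = icmp ugt i32 %a, %b
+// to {X86::COND_A, /*NeedSwap=*/false}, while
+//   %c = fcmp olt double %x, %y
+// yields {X86::COND_A, /*NeedSwap=*/true}: the operands are exchanged so that
+// the "above" form of the predicate can be used after the UCOMISD.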
+ +static std::pair<X86::CondCode, bool> +getX86ConditionCode(CmpInst::Predicate Predicate) { + X86::CondCode CC = X86::COND_INVALID; + bool NeedSwap = false; + switch (Predicate) { + default: break; + // Floating-point Predicates + case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; + case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGT: CC = X86::COND_A; break; + case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; + case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULT: CC = X86::COND_B; break; + case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; + case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; + case CmpInst::FCMP_UNO: CC = X86::COND_P; break; + case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; + case CmpInst::FCMP_OEQ: // fall-through + case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; + + // Integer Predicates + case CmpInst::ICMP_EQ: CC = X86::COND_E; break; + case CmpInst::ICMP_NE: CC = X86::COND_NE; break; + case CmpInst::ICMP_UGT: CC = X86::COND_A; break; + case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; + case CmpInst::ICMP_ULT: CC = X86::COND_B; break; + case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; + case CmpInst::ICMP_SGT: CC = X86::COND_G; break; + case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; + case CmpInst::ICMP_SLT: CC = X86::COND_L; break; + case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; + } + + return std::make_pair(CC, NeedSwap); +} + +static std::pair<unsigned, bool> +getX86SSEConditionCode(CmpInst::Predicate Predicate) { + unsigned CC; + bool NeedSwap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (Predicate) { + default: llvm_unreachable("Unexpected predicate"); + case CmpInst::FCMP_OEQ: CC = 0; break; + case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLT: CC = 1; break; + case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLE: CC = 2; break; + case CmpInst::FCMP_UNO: CC = 3; break; + case CmpInst::FCMP_UNE: CC = 4; break; + case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGE: CC = 5; break; + case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGT: CC = 6; break; + case CmpInst::FCMP_ORD: CC = 7; break; + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_ONE: CC = 8; break; + } + + return std::make_pair(CC, NeedSwap); +} + +/// \brief Adds a complex addressing mode to the given machine instr builder. +/// Note, this will constrain the index register. If its not possible to +/// constrain the given index register, then a new one will be created. The +/// IndexReg field of the addressing mode will be updated to match in this case. +const MachineInstrBuilder & +X86FastISel::addFullAddress(const MachineInstrBuilder &MIB, + X86AddressMode &AM) { + // First constrain the index register. It needs to be a GR64_NOSP. + AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg, + MIB->getNumOperands() + + X86::AddrIndexReg); + return ::addFullAddress(MIB, AM); +} + +/// \brief Check if it is possible to fold the condition from the XALU intrinsic +/// into the user. The condition code will only be updated on success. 
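+/// For illustration, the IR shape being matched looks like (hypothetical
+/// values):
+///   %res  = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+///   %sum  = extractvalue { i32, i1 } %res, 0
+///   %obit = extractvalue { i32, i1 } %res, 1
+///   br i1 %obit, label %overflow, label %cont
+/// Here the overflow bit can be read directly from EFLAGS (X86::COND_O), so
+/// the branch does not need a separate SETO/TEST pair.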
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond) { + if (!isa<ExtractValueInst>(Cond)) + return false; + + const auto *EV = cast<ExtractValueInst>(Cond); + if (!isa<IntrinsicInst>(EV->getAggregateOperand())) + return false; + + const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + MVT RetVT; + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + X86::CondCode TmpCC; + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; + } + + // Check if both instructions are in the same basic block. + if (II->getParent() != I->getParent()) + return false; + + // Make sure nothing is in the way + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and the + // instruction to be selected. + if (!isa<ExtractValueInst>(Itr)) + return false; + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) + return false; + } + + CC = TmpCC; + return true; +} + +bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { + EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true); + if (evt == MVT::Other || !evt.isSimple()) + // Unhandled type. Halt "fast" selection and bail. + return false; + + VT = evt.getSimpleVT(); + // For now, require SSE/SSE2 for performing floating-point operations, + // since x87 requires additional work. + if (VT == MVT::f64 && !X86ScalarSSEf64) + return false; + if (VT == MVT::f32 && !X86ScalarSSEf32) + return false; + // Similarly, no f80 support yet. + if (VT == MVT::f80) + return false; + // We only handle legal types. For example, on x86-32 the instruction + // selector contains all of the 64-bit instructions from x86-64, + // under the assumption that i64 won't be used if the target doesn't + // support it. + return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT); +} + +#include "X86GenCallingConv.inc" + +/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. +/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. +/// Return true and the result register by reference if it is possible. +bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, + MachineMemOperand *MMO, unsigned &ResultReg, + unsigned Alignment) { + // Get opcode and regclass of the output for the given load instruction. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + Opc = X86::MOV8rm; + RC = &X86::GR8RegClass; + break; + case MVT::i16: + Opc = X86::MOV16rm; + RC = &X86::GR16RegClass; + break; + case MVT::i32: + Opc = X86::MOV32rm; + RC = &X86::GR32RegClass; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = X86::MOV64rm; + RC = &X86::GR64RegClass; + break; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? 
X86::VMOVSSrm : X86::MOVSSrm; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp32m; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp64m; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + case MVT::v4f32: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPSrm : X86::MOVUPSrm; + RC = &X86::VR128RegClass; + break; + case MVT::v2f64: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPDrm : X86::MOVAPDrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPDrm : X86::MOVUPDrm; + RC = &X86::VR128RegClass; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVDQArm : X86::MOVDQArm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVDQUrm : X86::MOVDQUrm; + RC = &X86::VR128RegClass; + break; + } + + ResultReg = createResultReg(RC); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + addFullAddress(MIB, AM); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; +} + +/// X86FastEmitStore - Emit a machine instruction to store a value Val of +/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr +/// and a displacement offset, or a GlobalAddress, +/// i.e. V. Return true if it is possible. +bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasSSE4A = Subtarget->hasSSE4A(); + bool HasAVX = Subtarget->hasAVX(); + bool IsNonTemporal = MMO && MMO->isNonTemporal(); + + // Get opcode and regclass of the output for the given store instruction. + unsigned Opc = 0; + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f80: // No f80 support yet. + default: return false; + case MVT::i1: { + // Mask out all but lowest bit. + unsigned AndResult = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::AND8ri), AndResult) + .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); + ValReg = AndResult; + } + // FALLTHROUGH, handling i1 as i8. + case MVT::i8: Opc = X86::MOV8mr; break; + case MVT::i16: Opc = X86::MOV16mr; break; + case MVT::i32: + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr; + break; + case MVT::f32: + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSS; + else + Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + } else + Opc = X86::ST_Fp32m; + break; + case MVT::f64: + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSD; + else + Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; + } else + Opc = X86::ST_Fp64m; + break; + case MVT::v4f32: + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; + else + Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; + } else + Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; + break; + case MVT::v2f64: + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; + else + Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; + } else + Opc = HasAVX ? 
X86::VMOVUPDmr : X86::MOVUPDmr; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; + else + Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; + } else + Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr; + break; + } + + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + + return true; +} + +bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, + X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { + // Handle 'null' like i32/i64 0. + if (isa<ConstantPointerNull>(Val)) + Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); + + // If this is a store of a simple constant, fold the constant into the store. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { + unsigned Opc = 0; + bool Signed = true; + switch (VT.getSimpleVT().SimpleTy) { + default: break; + case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8. + case MVT::i8: Opc = X86::MOV8mi; break; + case MVT::i16: Opc = X86::MOV16mi; break; + case MVT::i32: Opc = X86::MOV32mi; break; + case MVT::i64: + // Must be a 32-bit sign extended value. + if (isInt<32>(CI->getSExtValue())) + Opc = X86::MOV64mi32; + break; + } + + if (Opc) { + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue() + : CI->getZExtValue()); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; + } + } + + unsigned ValReg = getRegForValue(Val); + if (ValReg == 0) + return false; + + bool ValKill = hasTrivialKill(Val); + return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); +} + +/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of +/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g. +/// ISD::SIGN_EXTEND). +bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, + unsigned Src, EVT SrcVT, + unsigned &ResultReg) { + unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, + Src, /*TODO: Kill=*/false); + if (RR == 0) + return false; + + ResultReg = RR; + return true; +} + +bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { + // Handle constant address. + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return false; + + // Can't handle TLS yet. + if (GV->isThreadLocal()) + return false; + + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } + + // Unless the ABI requires an extra load, return a direct reference to + // the global. 
+ if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; + } + + // Ok, we need to do a load from a stub. If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; + } else { + // Issue load from stub. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. + SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy(DL) == MVT::i64) { + Opc = X86::MOV64rm; + RC = &X86::GR64RegClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = &X86::GR32RegClass; + } + + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); + + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); + + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } + + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = nullptr; + return true; + } + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + +/// X86SelectAddress - Attempt to fill in an address from the given value. +/// +bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { + SmallVector<const Value *, 32> GEPs; +redo_gep: + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Don't walk into other basic blocks; it's possible we haven't + // visited them yet, so the instructions may not yet be assigned + // virtual registers. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + if (PointerType *Ty = dyn_cast<PointerType>(V->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts. + return X86SelectAddress(U->getOperand(0), AM); + + case Instruction::IntToPtr: + // Look past no-op inttoptrs. + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::PtrToInt: + // Look past no-op ptrtoints. 
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::Alloca: { + // Do static allocas. + const AllocaInst *A = cast<AllocaInst>(V); + DenseMap<const AllocaInst *, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(A); + if (SI != FuncInfo.StaticAllocaMap.end()) { + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = SI->second; + return true; + } + break; + } + + case Instruction::Add: { + // Adds of constants are common and easy enough. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) { + uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); + // They have to fit in the 32-bit signed displacement field though. + if (isInt<32>(Disp)) { + AM.Disp = (uint32_t)Disp; + return X86SelectAddress(U->getOperand(0), AM); + } + } + break; + } + + case Instruction::GetElementPtr: { + X86AddressMode SavedAM = AM; + + // Pattern-match simple GEPs. + uint64_t Disp = (int32_t)AM.Disp; + unsigned IndexReg = AM.IndexReg; + unsigned Scale = AM.Scale; + gep_type_iterator GTI = gep_type_begin(U); + // Iterate through the indices, folding what we can. Constants can be + // folded, and one dynamic index can be handled, if the scale is supported. + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); + i != e; ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = DL.getStructLayout(STy); + Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); + continue; + } + + // A array/variable index is always of the form i*S where S is the + // constant scale size. See if we can push the scale into immediates. + uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + Disp += CI->getSExtValue() * S; + break; + } + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + Disp += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + if (IndexReg == 0 && + (!AM.GV || !Subtarget->isPICStyleRIPRel()) && + (S == 1 || S == 2 || S == 4 || S == 8)) { + // Scaled-index addressing. + Scale = S; + IndexReg = getRegForGEPIndex(Op).first; + if (IndexReg == 0) + return false; + break; + } + // Unsupported. + goto unsupported_gep; + } + } + + // Check for displacement overflow. + if (!isInt<32>(Disp)) + break; + + AM.IndexReg = IndexReg; + AM.Scale = Scale; + AM.Disp = (uint32_t)Disp; + GEPs.push_back(V); + + if (const GetElementPtrInst *GEP = + dyn_cast<GetElementPtrInst>(U->getOperand(0))) { + // Ok, the GEP indices were covered by constant-offset and scaled-index + // addressing. Update the address state and move on to examining the base. + V = GEP; + goto redo_gep; + } else if (X86SelectAddress(U->getOperand(0), AM)) { + return true; + } + + // If we couldn't merge the gep value into this addr mode, revert back to + // our address and just match the value instead of completely failing. + AM = SavedAM; + + for (SmallVectorImpl<const Value *>::reverse_iterator + I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I) + if (handleConstantAddresses(*I, AM)) + return true; + + return false; + unsupported_gep: + // Ok, the GEP indices weren't all covered. 
+      break;
+  }
+  }
+
+  return handleConstantAddresses(V, AM);
+}
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  const Instruction *I = dyn_cast<Instruction>(V);
+  // Record if the value is defined in the same basic block.
+  //
+  // This information is crucial to know whether or not folding an
+  // operand is valid.
+  // Indeed, FastISel generates or reuses a virtual register for all
+  // operands of all instructions it selects. Obviously, the definition and
+  // its uses must use the same virtual register, otherwise the produced
+  // code is incorrect.
+  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+  // registers for values that are alive across basic blocks. This ensures
+  // that the values are set consistently across basic blocks, even if
+  // different instruction selection mechanisms are used (e.g., a mix of
+  // SDISel and FastISel).
+  // For values local to a basic block, the instruction selection process
+  // generates these virtual registers with whatever method is appropriate
+  // for its needs. In particular, FastISel and SDISel do not share the way
+  // local virtual registers are set.
+  // Therefore, it is impossible (or at least unsafe) to share values
+  // between basic blocks unless they use the same instruction selection
+  // method, which is not guaranteed for X86.
+  // Moreover, things like hasOneUse cannot be used accurately if we allow
+  // references to values across basic blocks when those values are not
+  // alive across basic blocks to begin with.
+  bool InMBB = true;
+  if (I) {
+    Opcode = I->getOpcode();
+    U = I;
+    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts if the operand is in the same BB.
+    if (InMBB)
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if the operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+            TLI.getPointerTy(DL))
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if the operand is in the same BB.
+    if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+  }
+
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // RIP-relative addresses can't have additional register operands.
+    if (Subtarget->isPICStyleRIPRel() &&
+        (AM.Base.Reg != 0 || AM.IndexReg != 0))
+      return false;
+
+    // Can't handle DLL Import.
+    if (GV->hasDLLImportStorageClass())
+      return false;
+
+    // Can't handle TLS.
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal())
+        return false;
+
+    // Okay, we've committed to selecting this global. Set up the basic address.
+    AM.GV = GV;
+
+    // No ABI requires an extra load for anything other than DLLImport, which
+    // we rejected above. Return a direct reference to the global.
+    if (Subtarget->isPICStyleRIPRel()) {
+      // Use rip-relative addressing if we can.
Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } else if (Subtarget->isPICStyleStubPIC()) { + AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET; + } else if (Subtarget->isPICStyleGOT()) { + AM.GVOpFlags = X86II::MO_GOTOFF; + } + + return true; + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + + +/// X86SelectStore - Select and emit code to implement store instructions. +bool X86FastISel::X86SelectStore(const Instruction *I) { + // Atomic stores need special handling. + const StoreInst *S = cast<StoreInst>(I); + + if (S->isAtomic()) + return false; + + const Value *Val = S->getValueOperand(); + const Value *Ptr = S->getPointerOperand(); + + MVT VT; + if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) + return false; + + unsigned Alignment = S->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + bool Aligned = Alignment >= ABIAlignment; + + X86AddressMode AM; + if (!X86SelectAddress(Ptr, AM)) + return false; + + return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); +} + +/// X86SelectRet - Select and emit code to implement ret instructions. +bool X86FastISel::X86SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + const X86MachineFunctionInfo *X86MFInfo = + FuncInfo.MF->getInfo<X86MachineFunctionInfo>(); + + if (!FuncInfo.CanLowerReturn) + return false; + + CallingConv::ID CC = F.getCallingConv(); + if (CC != CallingConv::C && + CC != CallingConv::Fast && + CC != CallingConv::X86_FastCall && + CC != CallingConv::X86_64_SysV) + return false; + + if (Subtarget->isCallingConvWin64(CC)) + return false; + + // Don't handle popping bytes on return for now. + if (X86MFInfo->getBytesToPopOnReturn() != 0) + return false; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + return false; + + // Let SDISel handle vararg functions. + if (F.isVarArg()) + return false; + + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_X86); + + const Value *RV = Ret->getOperand(0); + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. 
+ if (!VA.isRegLoc()) + return false; + + // The calling-convention tables for x87 returns don't tell + // the whole story. + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) + return false; + + unsigned SrcReg = Reg + VA.getValNo(); + EVT SrcVT = TLI.getValueType(DL, RV->getType()); + EVT DstVT = VA.getValVT(); + // Special handling for extended integers. + if (SrcVT != DstVT) { + if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + assert(DstVT == MVT::i32 && "X86 should always ext to i32"); + + if (SrcVT == MVT::i1) { + if (Outs[0].Flags.isSExt()) + return false; + SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + } + unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg, /*TODO: Kill=*/false); + } + + // Make the copy. + unsigned DstReg = VA.getLocReg(); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + // Avoid a cross-class copy. This is very unlikely. + if (!SrcRC->contains(DstReg)) + return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); + + // Add register to return instruction. + RetRegs.push_back(VA.getLocReg()); + } + + // All x86 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. + if (F.hasStructRetAttr()) { + unsigned Reg = X86MFInfo->getSRetReturnReg(); + assert(Reg && + "SRetReturnReg should have been set in LowerFormalArguments()!"); + unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), RetReg).addReg(Reg); + RetRegs.push_back(RetReg); + } + + // Now emit the RET. + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); + return true; +} + +/// X86SelectLoad - Select and emit code to implement load instructions. +/// +bool X86FastISel::X86SelectLoad(const Instruction *I) { + const LoadInst *LI = cast<LoadInst>(I); + + // Atomic loads need special handling. 
+ if (LI->isAtomic()) + return false; + + MVT VT; + if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) + return false; + + const Value *Ptr = LI->getPointerOperand(); + + X86AddressMode AM; + if (!X86SelectAddress(Ptr, AM)) + return false; + + unsigned Alignment = LI->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + + unsigned ResultReg = 0; + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg, + Alignment)) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { + bool HasAVX = Subtarget->hasAVX(); + bool X86ScalarSSEf32 = Subtarget->hasSSE1(); + bool X86ScalarSSEf64 = Subtarget->hasSSE2(); + + switch (VT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: return X86::CMP8rr; + case MVT::i16: return X86::CMP16rr; + case MVT::i32: return X86::CMP32rr; + case MVT::i64: return X86::CMP64rr; + case MVT::f32: + return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0; + case MVT::f64: + return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0; + } +} + +/// If we have a comparison with RHS as the RHS of the comparison, return an +/// opcode that works for the compare (e.g. CMP32ri) otherwise return 0. +static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { + int64_t Val = RHSC->getSExtValue(); + switch (VT.getSimpleVT().SimpleTy) { + // Otherwise, we can't fold the immediate into this comparison. + default: + return 0; + case MVT::i8: + return X86::CMP8ri; + case MVT::i16: + if (isInt<8>(Val)) + return X86::CMP16ri8; + return X86::CMP16ri; + case MVT::i32: + if (isInt<8>(Val)) + return X86::CMP32ri8; + return X86::CMP32ri; + case MVT::i64: + if (isInt<8>(Val)) + return X86::CMP64ri8; + // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext + // field. + if (isInt<32>(Val)) + return X86::CMP64ri32; + return 0; + } +} + +bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, + EVT VT, DebugLoc CurDbgLoc) { + unsigned Op0Reg = getRegForValue(Op0); + if (Op0Reg == 0) return false; + + // Handle 'null' like i32/i64 0. + if (isa<ConstantPointerNull>(Op1)) + Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext())); + + // We have two options: compare with register or immediate. If the RHS of + // the compare is an immediate that we can fold into this compare, use + // CMPri, otherwise use CMPrr. + if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { + if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc)) + .addReg(Op0Reg) + .addImm(Op1C->getSExtValue()); + return true; + } + } + + unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); + if (CompareOpc == 0) return false; + + unsigned Op1Reg = getRegForValue(Op1); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc)) + .addReg(Op0Reg) + .addReg(Op1Reg); + + return true; +} + +bool X86FastISel::X86SelectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); + + MVT VT; + if (!isTypeLegal(I->getOperand(0)->getType(), VT)) + return false; + + // Try to optimize or fold the cmp. 
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + unsigned ResultReg = 0; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: { + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), + ResultReg); + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) + return false; + break; + } + case CmpInst::FCMP_TRUE: { + ResultReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + ResultReg).addImm(1); + break; + } + } + + if (ResultReg) { + updateValueMap(I, ResultReg); + return true; + } + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *RHSC = dyn_cast<ConstantFP>(RHS); + if (RHSC && RHSC->isNullValue()) + RHS = LHS; + } + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. + static unsigned SETFOpcTable[2][3] = { + { X86::SETEr, X86::SETNPr, X86::AND8rr }, + { X86::SETNEr, X86::SETPr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; + case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; + } + + ResultReg = createResultReg(&X86::GR8RegClass); + if (SETFOpc) { + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) + return false; + + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), + ResultReg).addReg(FlagReg1).addReg(FlagReg2); + updateValueMap(I, ResultReg); + return true; + } + + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + unsigned Opc = X86::getSETFromCond(CC); + + if (SwapArgs) + std::swap(LHS, RHS); + + // Emit a compare of LHS/RHS. + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectZExt(const Instruction *I) { + EVT DstVT = TLI.getValueType(DL, I->getType()); + if (!TLI.isTypeLegal(DstVT)) + return false; + + unsigned ResultReg = getRegForValue(I->getOperand(0)); + if (ResultReg == 0) + return false; + + // Handle zero-extension from i1 to i8, which is common. + MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); + if (SrcVT.SimpleTy == MVT::i1) { + // Set the high bits to zero. + ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + + if (ResultReg == 0) + return false; + } + + if (DstVT == MVT::i64) { + // Handle extension to 64-bits via sub-register shenanigans. 
+ unsigned MovInst; + + switch (SrcVT.SimpleTy) { + case MVT::i8: MovInst = X86::MOVZX32rr8; break; + case MVT::i16: MovInst = X86::MOVZX32rr16; break; + case MVT::i32: MovInst = X86::MOV32rr; break; + default: llvm_unreachable("Unexpected zext to i64 source type"); + } + + unsigned Result32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32) + .addReg(ResultReg); + + ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), + ResultReg) + .addImm(0).addReg(Result32).addImm(X86::sub_32bit); + } else if (DstVT != MVT::i8) { + ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg, /*Kill=*/true); + if (ResultReg == 0) + return false; + } + + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectBranch(const Instruction *I) { + // Unconditional branches are selected by tablegen-generated code. + // Handle a conditional branch. + const BranchInst *BI = cast<BranchInst>(I); + MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Fold the common case of a conditional branch with a comparison + // in the same block (values defined on other blocks may not have + // initialized registers). + X86::CondCode CC; + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + if (CI->hasOneUse() && CI->getParent() == I->getParent()) { + EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType()); + + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true; + case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true; + } + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, + // 0.0. + // We don't have to materialize a zero constant for this case and can just + // use %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + + // Try to take advantage of fallthrough opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition + // code check. Instead two branch instructions are required to check all + // the flags. First we change the predicate to a supported condition code, + // which will be the first branch. Later one we will emit the second + // branch. + bool NeedExtraBranch = false; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + std::swap(TrueMBB, FalseMBB); // fall-through + case CmpInst::FCMP_UNE: + NeedExtraBranch = true; + Predicate = CmpInst::FCMP_ONE; + break; + } + + bool SwapArgs; + unsigned BranchOpc; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + BranchOpc = X86::GetCondBranchFromCond(CC); + if (SwapArgs) + std::swap(CmpLHS, CmpRHS); + + // Emit a compare of the LHS and RHS, setting the flags. 
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc())) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + + // X86 requires a second branch to handle UNE (and OEQ, which is mapped + // to UNE above). + if (NeedExtraBranch) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1)) + .addMBB(TrueMBB); + } + + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); + return true; + } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which + // typically happen for _Bool and C++ bools. + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { + unsigned TestOpc = 0; + switch (SourceVT.SimpleTy) { + default: break; + case MVT::i8: TestOpc = X86::TEST8ri; break; + case MVT::i16: TestOpc = X86::TEST16ri; break; + case MVT::i32: TestOpc = X86::TEST32ri; break; + case MVT::i64: TestOpc = X86::TEST64ri32; break; + } + if (TestOpc) { + unsigned OpReg = getRegForValue(TI->getOperand(0)); + if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) + .addReg(OpReg).addImm(1); + + unsigned JmpOpc = X86::JNE_1; + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + JmpOpc = X86::JE_1; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) + .addMBB(TrueMBB); + + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); + return true; + } + } + } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(BI->getCondition()); + if (TmpReg == 0) + return false; + + unsigned BranchOpc = X86::GetCondBranchFromCond(CC); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); + return true; + } + + // Otherwise do a clumsy setcc and re-test it. + // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used + // in an explicit cast, so make sure to handle that correctly. 
+ unsigned OpReg = getRegForValue(BI->getCondition()); + if (OpReg == 0) return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(OpReg).addImm(1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) + .addMBB(TrueMBB); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); + return true; +} + +bool X86FastISel::X86SelectShift(const Instruction *I) { + unsigned CReg = 0, OpReg = 0; + const TargetRegisterClass *RC = nullptr; + if (I->getType()->isIntegerTy(8)) { + CReg = X86::CL; + RC = &X86::GR8RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(16)) { + CReg = X86::CX; + RC = &X86::GR16RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR16rCL; break; + case Instruction::AShr: OpReg = X86::SAR16rCL; break; + case Instruction::Shl: OpReg = X86::SHL16rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(32)) { + CReg = X86::ECX; + RC = &X86::GR32RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR32rCL; break; + case Instruction::AShr: OpReg = X86::SAR32rCL; break; + case Instruction::Shl: OpReg = X86::SHL32rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(64)) { + CReg = X86::RCX; + RC = &X86::GR64RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR64rCL; break; + case Instruction::AShr: OpReg = X86::SAR64rCL; break; + case Instruction::Shl: OpReg = X86::SHL64rCL; break; + default: return false; + } + } else { + return false; + } + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) return false; + + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), + CReg).addReg(Op1Reg); + + // The shift instruction uses X86::CL. If we defined a super-register + // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. + if (CReg != X86::CL) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::KILL), X86::CL) + .addReg(CReg, RegState::Kill); + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) + .addReg(Op0Reg); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectDivRem(const Instruction *I) { + const static unsigned NumTypes = 4; // i8, i16, i32, i64 + const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem + const static bool S = true; // IsSigned + const static bool U = false; // !IsSigned + const static unsigned Copy = TargetOpcode::COPY; + // For the X86 DIV/IDIV instruction, in most cases the dividend + // (numerator) must be in a specific register pair highreg:lowreg, + // producing the quotient in lowreg and the remainder in highreg. + // For most data types, to set up the instruction, the dividend is + // copied into lowreg, and lowreg is sign-extended or zero-extended + // into highreg. The exception is i8, where the dividend is defined + // as a single register rather than a register pair, and we + // therefore directly sign-extend or zero-extend the dividend into + // lowreg, instead of copying, and ignore the highreg. 
+ const static struct DivRemEntry { + // The following portion depends only on the data type. + const TargetRegisterClass *RC; + unsigned LowInReg; // low part of the register pair + unsigned HighInReg; // high part of the register pair + // The following portion depends on both the data type and the operation. + struct DivRemResult { + unsigned OpDivRem; // The specific DIV/IDIV opcode to use. + unsigned OpSignExtend; // Opcode for sign-extending lowreg into + // highreg, or copying a zero into highreg. + unsigned OpCopy; // Opcode for copying dividend into lowreg, or + // zero/sign-extending into lowreg for i8. + unsigned DivRemResultReg; // Register containing the desired result. + bool IsOpSigned; // Whether to use signed or unsigned form. + } ResultTable[NumOps]; + } OpTable[NumTypes] = { + { &X86::GR8RegClass, X86::AX, 0, { + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem + } + }, // i8 + { &X86::GR16RegClass, X86::AX, X86::DX, { + { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv + { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem + { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv + { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem + } + }, // i16 + { &X86::GR32RegClass, X86::EAX, X86::EDX, { + { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv + { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem + { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv + { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem + } + }, // i32 + { &X86::GR64RegClass, X86::RAX, X86::RDX, { + { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv + { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem + { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv + { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem + } + }, // i64 + }; + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned TypeIndex, OpIndex; + switch (VT.SimpleTy) { + default: return false; + case MVT::i8: TypeIndex = 0; break; + case MVT::i16: TypeIndex = 1; break; + case MVT::i32: TypeIndex = 2; break; + case MVT::i64: TypeIndex = 3; + if (!Subtarget->is64Bit()) + return false; + break; + } + + switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected div/rem opcode"); + case Instruction::SDiv: OpIndex = 0; break; + case Instruction::SRem: OpIndex = 1; break; + case Instruction::UDiv: OpIndex = 2; break; + case Instruction::URem: OpIndex = 3; break; + } + + const DivRemEntry &TypeEntry = OpTable[TypeIndex]; + const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) + return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) + return false; + + // Move op0 into low-order input register. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg); + // Zero-extend or sign-extend into high-order input register. + if (OpEntry.OpSignExtend) { + if (OpEntry.IsOpSigned) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(OpEntry.OpSignExtend)); + else { + unsigned Zero32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::MOV32r0), Zero32); + + // Copy the zero into the appropriate sub/super/identical physical + // register. 
Unfortunately the operations needed are not uniform enough + // to fit neatly into the table above. + if (VT.SimpleTy == MVT::i16) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32, 0, X86::sub_16bit); + } else if (VT.SimpleTy == MVT::i32) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32); + } else if (VT.SimpleTy == MVT::i64) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) + .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); + } + } + } + // Generate the DIV/IDIV instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); + // For i8 remainder, we can't reference AH directly, as we'll end + // up with bogus copies like %R9B = COPY %AH. Reference AX + // instead to prevent AH references in a REX instruction. + // + // The current assumption of the fast register allocator is that isel + // won't generate explicit references to the GPR8_NOREX registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. + unsigned ResultReg = 0; + if ((I->getOpcode() == Instruction::SRem || + I->getOpcode() == Instruction::URem) && + OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) { + unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass); + unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Copy), SourceSuperReg).addReg(X86::AX); + + // Shift AX right by 8 bits instead of using AH. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri), + ResultSuperReg).addReg(SourceSuperReg).addImm(8); + + // Now reference the 8-bit subreg of the result. + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, + /*Kill=*/true, X86::sub_8bit); + } + // Copy the result out of the physreg if we haven't already. + if (!ResultReg) { + ResultReg = createResultReg(TypeEntry.RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg) + .addReg(OpEntry.DivRemResultReg); + } + updateValueMap(I, ResultReg); + + return true; +} + +/// \brief Emit a conditional move instruction (if the are supported) to lower +/// the select. +bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { + // Check if the subtarget supports these instructions. + if (!Subtarget->hasCMov()) + return false; + + // FIXME: Add support for i8. + if (RetVT < MVT::i16 || RetVT > MVT::i64) + return false; + + const Value *Cond = I->getOperand(0); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + bool NeedTest = true; + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
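+ // Recall that UCOMISS/UCOMISD signal an unordered result (a NaN operand) by
+ // setting ZF, PF and CF all to 1, so:
+ //   oeq  <=>  ZF == 1 && PF == 0
+ //   une  <=>  ZF == 0 || PF == 1
+ // which is why each of these predicates needs a pair of setcc instructions.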
+ static unsigned SETFOpcTable[2][3] = { + { X86::SETNPr, X86::SETEr , X86::TEST8rr }, + { X86::SETPr, X86::SETNEr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + SETFOpc = &SETFOpcTable[0][0]; + Predicate = CmpInst::ICMP_NE; + break; + case CmpInst::FCMP_UNE: + SETFOpc = &SETFOpcTable[1][0]; + Predicate = CmpInst::ICMP_NE; + break; + } + + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) + return false; + + if (SETFOpc) { + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + auto const &II = TII.get(SETFOpc[2]); + if (II.getNumDefs()) { + unsigned TmpReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) + .addReg(FlagReg2).addReg(FlagReg1); + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(FlagReg2).addReg(FlagReg1); + } + } + NeedTest = false; + } else if (foldX86XALUIntrinsic(CC, I, Cond)) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(Cond); + if (TmpReg == 0) + return false; + + NeedTest = false; + } + + if (NeedTest) { + // Selects operate on i1, however, CondReg is 8 bits width and may contain + // garbage. Indeed, only the less significant bit is supposed to be + // accurate. If we read more than the lsb, we may see non-zero values + // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for + // the select. This is achieved by performing TEST against 1. + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + if (!LHSReg || !RHSReg) + return false; + + unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill); + updateValueMap(I, ResultReg); + return true; +} + +/// \brief Emit SSE or AVX instructions to lower the select. +/// +/// Try to use SSE1/SSE2 instructions to simulate a select without branches. +/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary +/// SSE instructions are available. If AVX is available, try to use a VBLENDV. +bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). 
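+ // Without AVX, the body below builds the classic branchless select, roughly:
+ //   %mask = cmp<cc>ss/sd %cmpLHS, %cmpRHS     ; all-ones or all-zeros
+ //   %res  = (%mask & %trueVal) | (~%mask & %falseVal)
+ // using the CMP/AND/ANDN/OR opcodes from the table further down.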
+ const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0)); + if (!CI || (CI->getParent() != I->getParent())) + return false; + + if (I->getType() != CI->getOperand(0)->getType() || + !((Subtarget->hasSSE1() && RetVT == MVT::f32) || + (Subtarget->hasSSE2() && RetVT == MVT::f64))) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + + unsigned CC; + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); + if (CC > 7) + return false; + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + // Choose the SSE instruction sequence based on data type (float or double). + static unsigned OpcTable[2][4] = { + { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr } + }; + + unsigned *Opc = nullptr; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::f32: Opc = &OpcTable[0][0]; break; + case MVT::f64: Opc = &OpcTable[1][0]; break; + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned CmpLHSReg = getRegForValue(CmpLHS); + bool CmpLHSIsKill = hasTrivialKill(CmpLHS); + + unsigned CmpRHSReg = getRegForValue(CmpRHS); + bool CmpRHSIsKill = hasTrivialKill(CmpRHS); + + if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned ResultReg; + + if (Subtarget->hasAVX()) { + const TargetRegisterClass *FR32 = &X86::FR32RegClass; + const TargetRegisterClass *VR128 = &X86::VR128RegClass; + + // If we have AVX, create 1 blendv instead of 3 logic instructions. + // Blendv was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + unsigned CmpOpcode = + (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; + unsigned BlendOpcode = + (RetVT.SimpleTy == MVT::f32) ? 
X86::VBLENDVPSrr : X86::VBLENDVPDrr; + + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CmpReg, true); + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); + } else { + unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); + ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + } + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { + // These are pseudo CMOV instructions and will be later expanded into control- + // flow. + unsigned Opc; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::i8: Opc = X86::CMOV_GR8; break; + case MVT::i16: Opc = X86::CMOV_GR16; break; + case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::f32: Opc = X86::CMOV_FR32; break; + case MVT::f64: Opc = X86::CMOV_FR64; break; + } + + const Value *Cond = I->getOperand(0); + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); + if (CC > X86::LAST_VALID_COND) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) + return false; + } else { + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + if (!LHSReg || !RHSReg) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + + unsigned ResultReg = + fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectSelect(const Instruction *I) { + MVT RetVT; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + // Check if we can fold the select. + if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + const Value *Opnd = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; + case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; + } + // No need for a select anymore - this is an unconditional move. 
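+ // e.g. 'select (fcmp false ...), %a, %b' simply becomes a copy of %b, and
+ // 'select (fcmp true ...), %a, %b' a copy of %a.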
+ if (Opnd) { + unsigned OpReg = getRegForValue(Opnd); + if (OpReg == 0) + return false; + bool OpIsKill = hasTrivialKill(Opnd); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(OpReg, getKillRegState(OpIsKill)); + updateValueMap(I, ResultReg); + return true; + } + } + + // First try to use real conditional move instructions. + if (X86FastEmitCMoveSelect(RetVT, I)) + return true; + + // Try to use a sequence of SSE instructions to simulate a conditional move. + if (X86FastEmitSSESelect(RetVT, I)) + return true; + + // Fall-back to pseudo conditional move instructions, which will be later + // converted to control-flow. + if (X86FastEmitPseudoSelect(RetVT, I)) + return true; + + return false; +} + +bool X86FastISel::X86SelectSIToFP(const Instruction *I) { + // The target-independent selection algorithm in FastISel already knows how + // to select a SINT_TO_FP if the target is SSE but not AVX. + // Early exit if the subtarget doesn't have AVX. + if (!Subtarget->hasAVX()) + return false; + + if (!I->getOperand(0)->getType()->isIntegerTy(32)) + return false; + + // Select integer to float/double conversion. + unsigned OpReg = getRegForValue(I->getOperand(0)); + if (OpReg == 0) + return false; + + const TargetRegisterClass *RC = nullptr; + unsigned Opcode; + + if (I->getType()->isDoubleTy()) { + // sitofp int -> double + Opcode = X86::VCVTSI2SDrr; + RC = &X86::FR64RegClass; + } else if (I->getType()->isFloatTy()) { + // sitofp int -> float + Opcode = X86::VCVTSI2SSrr; + RC = &X86::FR32RegClass; + } else + return false; + + unsigned ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + unsigned ResultReg = + fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false); + updateValueMap(I, ResultReg); + return true; +} + +// Helper method used by X86SelectFPExt and X86SelectFPTrunc. +bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, + unsigned TargetOpc, + const TargetRegisterClass *RC) { + assert((I->getOpcode() == Instruction::FPExt || + I->getOpcode() == Instruction::FPTrunc) && + "Instruction must be an FPExt or FPTrunc!"); + + unsigned OpReg = getRegForValue(I->getOperand(0)); + if (OpReg == 0) + return false; + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), + ResultReg); + if (Subtarget->hasAVX()) + MIB.addReg(OpReg); + MIB.addReg(OpReg); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectFPExt(const Instruction *I) { + if (X86ScalarSSEf64 && I->getType()->isDoubleTy() && + I->getOperand(0)->getType()->isFloatTy()) { + // fpext from float to double. + unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; + return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass); + } + + return false; +} + +bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { + if (X86ScalarSSEf64 && I->getType()->isFloatTy() && + I->getOperand(0)->getType()->isDoubleTy()) { + // fptrunc from double to float. + unsigned Opc = Subtarget->hasAVX() ? 
X86::VCVTSD2SSrr : X86::CVTSD2SSrr; + return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass); + } + + return false; +} + +bool X86FastISel::X86SelectTrunc(const Instruction *I) { + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + + // This code only handles truncation to byte. + if (DstVT != MVT::i8 && DstVT != MVT::i1) + return false; + if (!TLI.isTypeLegal(SrcVT)) + return false; + + unsigned InputReg = getRegForValue(I->getOperand(0)); + if (!InputReg) + // Unhandled operand. Halt "fast" selection and bail. + return false; + + if (SrcVT == MVT::i8) { + // Truncate from i8 to i1; no code needed. + updateValueMap(I, InputReg); + return true; + } + + bool KillInputReg = false; + if (!Subtarget->is64Bit()) { + // If we're on x86-32; we can't extract an i8 from a general register. + // First issue a copy to GR16_ABCD or GR32_ABCD. + const TargetRegisterClass *CopyRC = + (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass; + unsigned CopyReg = createResultReg(CopyRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); + InputReg = CopyReg; + KillInputReg = true; + } + + // Issue an extract_subreg. + unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, + InputReg, KillInputReg, + X86::sub_8bit); + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::IsMemcpySmall(uint64_t Len) { + return Len <= (Subtarget->is64Bit() ? 32 : 16); +} + +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len) { + + // Make sure we don't bloat code by inlining very large memcpy's. + if (!IsMemcpySmall(Len)) + return false; + + bool i64Legal = Subtarget->is64Bit(); + + // We don't care about alignment here since we just emit integer accesses. + while (Len) { + MVT VT; + if (Len >= 8 && i64Legal) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else + VT = MVT::i8; + + unsigned Reg; + bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); + RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); + assert(RV && "Failed to emit load or store??"); + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + DestAM.Disp += Size; + SrcAM.Disp += Size; + } + + return true; +} + +bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { + // FIXME: Handle more intrinsics. + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::convert_from_fp16: + case Intrinsic::convert_to_fp16: { + if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) + return false; + + const Value *Op = II->getArgOperand(0); + unsigned InputReg = getRegForValue(Op); + if (InputReg == 0) + return false; + + // F16C only allows converting from float to half and from half to float. + bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16; + if (IsFloatToHalf) { + if (!Op->getType()->isFloatTy()) + return false; + } else { + if (!II->getType()->isFloatTy()) + return false; + } + + unsigned ResultReg = 0; + const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16); + if (IsFloatToHalf) { + // 'InputReg' is implicitly promoted from register class FR32 to + // register class VR128 by method 'constrainOperandRegClass' which is + // directly called by 'fastEmitInst_ri'. + // Instruction VCVTPS2PHrr takes an extra immediate operand which is + // used to provide rounding control. 
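+ // (An immediate of 0, as passed below, encodes round-to-nearest-even rather
+ // than deferring to MXCSR.RC.)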
+ InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 0); + + // Move the lower 32-bits of ResultReg to another register of class GR32. + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::VMOVPDI2DIrr), ResultReg) + .addReg(InputReg, RegState::Kill); + + // The result value is in the lower 16-bits of ResultReg. + unsigned RegIdx = X86::sub_16bit; + ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); + } else { + assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); + // Explicitly sign-extend the input to 32-bit. + InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg, + /*Kill=*/false); + + // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. + InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR, + InputReg, /*Kill=*/true); + + InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true); + + // The result value is in the lower 32-bits of ResultReg. + // Emit an explicit copy from register class VR128 to register class FR32. + ResultReg = createResultReg(&X86::FR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(InputReg, RegState::Kill); + } + + updateValueMap(II, ResultReg); + return true; + } + case Intrinsic::frameaddress: { + MachineFunction *MF = FuncInfo.MF; + if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI()) + return false; + + Type *RetTy = II->getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + unsigned Opc; + const TargetRegisterClass *RC = nullptr; + + switch (VT.SimpleTy) { + default: llvm_unreachable("Invalid result type for frameaddress."); + case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; + case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; + } + + // This needs to be set before we call getPtrSizedFrameRegister, otherwise + // we get the wrong frame register. + MachineFrameInfo *MFI = MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + + // Always make a copy of the frame register to to a vreg first, so that we + // never directly reference the frame register (the TwoAddressInstruction- + // Pass doesn't like that). + unsigned SrcReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); + + // Now recursively load from the frame address. + // movq (%rbp), %rax + // movq (%rax), %rax + // movq (%rax), %rax + // ... + unsigned DestReg; + unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), DestReg), SrcReg); + SrcReg = DestReg; + } + + updateValueMap(II, SrcReg); + return true; + } + case Intrinsic::memcpy: { + const MemCpyInst *MCI = cast<MemCpyInst>(II); + // Don't handle volatile or variable length memcpys. + if (MCI->isVolatile()) + return false; + + if (isa<ConstantInt>(MCI->getLength())) { + // Small memcpy's are common enough that we want to do them + // without a call if possible. 
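+ // e.g. a constant 16-byte memcpy on x86-64 is expanded by TryEmitSmallMemcpy
+ // into two i64 load/store pairs instead of a library call.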
+ uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue(); + if (IsMemcpySmall(Len)) { + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI->getRawDest(), DestAM) || + !X86SelectAddress(MCI->getRawSource(), SrcAM)) + return false; + TryEmitSmallMemcpy(DestAM, SrcAM, Len); + return true; + } + } + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255) + return false; + + return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2); + } + case Intrinsic::memset: { + const MemSetInst *MSI = cast<MemSetInst>(II); + + if (MSI->isVolatile()) + return false; + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MSI->getDestAddressSpace() > 255) + return false; + + return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); + } + case Intrinsic::stackprotector: { + // Emit code to store the stack guard onto the stack. + EVT PtrTy = TLI.getPointerTy(DL); + + const Value *Op1 = II->getArgOperand(0); // The guard's value. + const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1)); + + MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]); + + // Grab the frame index. + X86AddressMode AM; + if (!X86SelectAddress(Slot, AM)) return false; + if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; + return true; + } + case Intrinsic::dbg_declare: { + const DbgDeclareInst *DI = cast<DbgDeclareInst>(II); + X86AddressMode AM; + assert(DI->getAddress() && "Null address should be checked earlier!"); + if (!X86SelectAddress(DI->getAddress(), AM)) + return false; + const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); + // FIXME may need to add RegState::Debug to any registers produced, + // although ESP/EBP should be the only ones at the moment. + assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) && + "Expected inlined-at fields to agree"); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) + .addImm(0) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); + return true; + } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); + return true; + } + case Intrinsic::sqrt: { + if (!Subtarget->hasSSE1()) + return false; + + Type *RetTy = II->getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT + // is not generated by FastISel yet. + // FIXME: Update this code once tablegen can handle it. 
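+ // Note that the AVX forms take an extra source register that supplies the
+ // untouched upper vector lanes, which is why an IMPLICIT_DEF is fed in as
+ // the first operand below when HasAVX is set.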
+ static const unsigned SqrtOpc[2][2] = { + {X86::SQRTSSr, X86::VSQRTSSr}, + {X86::SQRTSDr, X86::VSQRTSDr} + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break; + case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; + } + + const Value *SrcVal = II->getArgOperand(0); + unsigned SrcReg = getRegForValue(SrcVal); + + if (SrcReg == 0) + return false; + + unsigned ImplicitDefReg = 0; + if (HasAVX) { + ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + } + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + ResultReg); + + if (ImplicitDefReg) + MIB.addReg(ImplicitDefReg); + + MIB.addReg(SrcReg); + + updateValueMap(II, ResultReg); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + // This implements the basic lowering of the xalu with overflow intrinsics + // into add/sub/mul followed by either seto or setb. + const Function *Callee = II->getCalledFunction(); + auto *Ty = cast<StructType>(Callee->getReturnType()); + Type *RetTy = Ty->getTypeAtIndex(0U); + Type *CondTy = Ty->getTypeAtIndex(1); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + if (VT < MVT::i8 || VT > MVT::i64) + return false; + + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); + + // Canonicalize immediate to the RHS. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && + isCommutativeIntrinsic(II)) + std::swap(LHS, RHS); + + bool UseIncDec = false; + if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne()) + UseIncDec = true; + + unsigned BaseOpc, CondOpc; + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::sadd_with_overflow: + BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD); + CondOpc = X86::SETOr; + break; + case Intrinsic::uadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; + case Intrinsic::ssub_with_overflow: + BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB); + CondOpc = X86::SETOr; + break; + case Intrinsic::usub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; + case Intrinsic::smul_with_overflow: + BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; + case Intrinsic::umul_with_overflow: + BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; + } + + unsigned LHSReg = getRegForValue(LHS); + if (LHSReg == 0) + return false; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned ResultReg = 0; + // Check if we have an immediate version. 
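+ // For instance '@llvm.sadd.with.overflow.i32(%x, 1)' takes the INC path
+ // below and becomes roughly:
+ //   INC32r %x  -> %val
+ //   SETOr      -> %ovf
+ // with the two results returned in consecutive result registers.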
+ if (const auto *CI = dyn_cast<ConstantInt>(RHS)) { + static const unsigned Opc[2][4] = { + { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, + { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } + }; + + if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) { + ResultReg = createResultReg(TLI.getRegClassFor(VT)); + bool IsDec = BaseOpc == X86ISD::DEC; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + } else + ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, + CI->getZExtValue()); + } + + unsigned RHSReg; + bool RHSIsKill; + if (!ResultReg) { + RHSReg = getRegForValue(RHS); + if (RHSReg == 0) + return false; + RHSIsKill = hasTrivialKill(RHS); + ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, + RHSIsKill); + } + + // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit + // it manually. + if (BaseOpc == X86ISD::UMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; + static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; + // First copy the first operand into RAX, which is an implicit input to + // the X86::MUL*r instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), RHSReg, RHSIsKill); + } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; + if (VT == MVT::i8) { + // Copy the first operand into AL, which is an implicit input to the + // X86::IMUL8r instruction. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::AL) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, + RHSIsKill); + } else + ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + } + + if (!ResultReg) + return false; + + unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); + assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), + ResultReg2); + + updateValueMap(II, ResultReg, 2); + return true; + } + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: { + bool IsInputDouble; + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic."); + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + if (!Subtarget->hasSSE1()) + return false; + IsInputDouble = false; + break; + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + if (!Subtarget->hasSSE2()) + return false; + IsInputDouble = true; + break; + } + + Type *RetTy = II->getCalledFunction()->getReturnType(); + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + static const unsigned CvtOpc[2][2][2] = { + { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr }, + { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } }, + { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr }, + { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } } + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected result type."); + case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break; + case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break; + } + + // Check if we can fold insertelement instructions into the convert. + const Value *Op = II->getArgOperand(0); + while (auto *IE = dyn_cast<InsertElementInst>(Op)) { + const Value *Index = IE->getOperand(2); + if (!isa<ConstantInt>(Index)) + break; + unsigned Idx = cast<ConstantInt>(Index)->getZExtValue(); + + if (Idx == 0) { + Op = IE->getOperand(1); + break; + } + Op = IE->getOperand(0); + } + + unsigned Reg = getRegForValue(Op); + if (Reg == 0) + return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Reg); + + updateValueMap(II, ResultReg); + return true; + } + } +} + +bool X86FastISel::fastLowerArguments() { + if (!FuncInfo.CanLowerReturn) + return false; + + const Function *F = FuncInfo.Fn; + if (F->isVarArg()) + return false; + + CallingConv::ID CC = F->getCallingConv(); + if (CC != CallingConv::C) + return false; + + if (Subtarget->isCallingConvWin64(CC)) + return false; + + if (!Subtarget->is64Bit()) + return false; + + // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. + unsigned GPRCnt = 0; + unsigned FPRCnt = 0; + unsigned Idx = 0; + for (auto const &Arg : F->args()) { + // The first argument is at index 1. 
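+ // (Attribute index 0 refers to the return value, so parameter attributes are
+ // queried with 1-based indices, hence the pre-increment below.)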
+ ++Idx; + if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || + F->getAttributes().hasAttribute(Idx, Attribute::InReg) || + F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::Nest)) + return false; + + Type *ArgTy = Arg.getType(); + if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) + return false; + + EVT ArgVT = TLI.getValueType(DL, ArgTy); + if (!ArgVT.isSimple()) return false; + switch (ArgVT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i32: + case MVT::i64: + ++GPRCnt; + break; + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasSSE1()) + return false; + ++FPRCnt; + break; + } + + if (GPRCnt > 6) + return false; + + if (FPRCnt > 8) + return false; + } + + static const MCPhysReg GPR32ArgRegs[] = { + X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D + }; + static const MCPhysReg GPR64ArgRegs[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 + }; + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + + unsigned GPRIdx = 0; + unsigned FPRIdx = 0; + for (auto const &Arg : F->args()) { + MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + unsigned SrcReg; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; + case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; + case MVT::f32: // fall-through + case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; + } + unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. + // Without this, EmitLiveInCopies may eliminate the livein if its only + // use is a bitcast (which isn't turned into an instruction). + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(DstReg, getKillRegState(true)); + updateValueMap(&Arg, ResultReg); + } + return true; +} + +static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, + CallingConv::ID CC, + ImmutableCallSite *CS) { + if (Subtarget->is64Bit()) + return 0; + if (Subtarget->getTargetTriple().isOSMSVCRT()) + return 0; + if (CC == CallingConv::Fast || CC == CallingConv::GHC || + CC == CallingConv::HiPE) + return 0; + + if (CS) + if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) || + CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU()) + return 0; + + return 4; +} + +bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { + auto &OutVals = CLI.OutVals; + auto &OutFlags = CLI.OutFlags; + auto &OutRegs = CLI.OutRegs; + auto &Ins = CLI.Ins; + auto &InRegs = CLI.InRegs; + CallingConv::ID CC = CLI.CallConv; + bool &IsTailCall = CLI.IsTailCall; + bool IsVarArg = CLI.IsVarArg; + const Value *Callee = CLI.Callee; + MCSymbol *Symbol = CLI.Symbol; + + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isCallingConvWin64(CC); + + // Handle only C, fastcc, and webkit_js calling conventions for now. + switch (CC) { + default: return false; + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::WebKit_JS: + case CallingConv::X86_FastCall: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + break; + } + + // Allow SelectionDAG isel to handle tail calls. 
+ if (IsTailCall) + return false; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + return false; + + // Don't know how to handle Win64 varargs yet. Nothing special needed for + // x86-32. Special handling for x86-64 is implemented. + if (IsVarArg && IsWin64) + return false; + + // Don't know about inalloca yet. + if (CLI.CS && CLI.CS->hasInAllocaArgument()) + return false; + + // Fast-isel doesn't know about callee-pop yet. + if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg, + TM.Options.GuaranteedTailCallOpt)) + return false; + + SmallVector<MVT, 16> OutVTs; + SmallVector<unsigned, 16> ArgRegs; + + // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all FastISel supported + // calling conventions on x86. + for (int i = 0, e = OutVals.size(); i != e; ++i) { + Value *&Val = OutVals[i]; + ISD::ArgFlagsTy Flags = OutFlags[i]; + if (auto *CI = dyn_cast<ConstantInt>(Val)) { + if (CI->getBitWidth() < 32) { + if (Flags.isSExt()) + Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext())); + else + Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext())); + } + } + + // Passing bools around ends up doing a trunc to i1 and passing it. + // Codegen this as an argument + "and 1". + MVT VT; + auto *TI = dyn_cast<TruncInst>(Val); + unsigned ResultReg; + if (TI && TI->getType()->isIntegerTy(1) && CLI.CS && + (TI->getParent() == CLI.CS->getInstruction()->getParent()) && + TI->hasOneUse()) { + Value *PrevVal = TI->getOperand(0); + ResultReg = getRegForValue(PrevVal); + + if (!ResultReg) + return false; + + if (!isTypeLegal(PrevVal->getType(), VT)) + return false; + + ResultReg = + fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1); + } else { + if (!isTypeLegal(Val->getType(), VT)) + return false; + ResultReg = getRegForValue(Val); + } + + if (!ResultReg) + return false; + + ArgRegs.push_back(ResultReg); + OutVTs.push_back(VT); + } + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) + .addImm(NumBytes).addImm(0); + + // Walk the register/memloc assignments, inserting copies/loads. + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign const &VA = ArgLocs[i]; + const Value *ArgVal = OutVals[VA.getValNo()]; + MVT ArgVT = OutVTs[VA.getValNo()]; + + if (ArgVT == MVT::x86mmx) + return false; + + unsigned ArgReg = ArgRegs[VA.getValNo()]; + + // Promote the value if needed. 
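+    // For example, an i8 value assigned to a 32-bit location must first be
+    // sign-, zero-, or any-extended to the location type before the register
+    // copy or stack store below.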
+ switch (VA.getLocInfo()) { + case CCValAssign::Full: break; + case CCValAssign::SExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + assert(Emitted && "Failed to emit a sext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::ZExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + assert(Emitted && "Failed to emit a zext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::AExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + + assert(Emitted && "Failed to emit a aext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::BCvt: { + ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg, + /*TODO: Kill=*/false); + assert(ArgReg && "Failed to emit a bitcast!"); + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::VExt: + // VExt has not been implemented, so this should be impossible to reach + // for now. However, fallback to Selection DAG isel once implemented. + return false; + case CCValAssign::AExtUpper: + case CCValAssign::SExtUpper: + case CCValAssign::ZExtUpper: + case CCValAssign::FPExt: + llvm_unreachable("Unexpected loc info!"); + case CCValAssign::Indirect: + // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully + // support this. + return false; + } + + if (VA.isRegLoc()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); + OutRegs.push_back(VA.getLocReg()); + } else { + assert(VA.isMemLoc()); + + // Don't emit stores for undef values. + if (isa<UndefValue>(ArgVal)) + continue; + + unsigned LocMemOffset = VA.getLocMemOffset(); + X86AddressMode AM; + AM.Base.Reg = RegInfo->getStackRegister(); + AM.Disp = LocMemOffset; + ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; + unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); + if (Flags.isByVal()) { + X86AddressMode SrcAM; + SrcAM.Base.Reg = ArgReg; + if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize())) + return false; + } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { + // If this is a really simple value, emit this with the Value* version + // of X86FastEmitStore. If it isn't simple, we don't want to do this, + // as it can cause us to reevaluate the argument. + if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO)) + return false; + } else { + bool ValIsKill = hasTrivialKill(ArgVal); + if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO)) + return false; + } + } + } + + // ELF / PIC requires GOT in the EBX register before function calls via PLT + // GOT pointer. 
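+  // (This only applies to 32-bit ELF PIC code; x86-64 PIC addressing is
+  // RIP-relative and does not dedicate EBX to the GOT.)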
+ if (Subtarget->isPICStyleGOT()) { + unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base); + } + + if (Is64Bit && IsVarArg && !IsWin64) { + // From AMD64 ABI document: + // For calls that may call functions that use varargs or stdargs + // (prototype-less calls or calls to functions containing ellipsis (...) in + // the declaration) %al is used as hidden argument to specify the number + // of SSE registers used. The contents of %al do not need to match exactly + // the number of registers, but must be an ubound on the number of SSE + // registers used and is in the range 0 - 8 inclusive. + + // Count the number of XMM registers allocated. + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); + assert((Subtarget->hasSSE1() || !NumXMMRegs) + && "SSE registers cannot be used when SSE is disabled"); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + X86::AL).addImm(NumXMMRegs); + } + + // Materialize callee address in a register. FIXME: GV address can be + // handled with a CALLpcrel32 instead. + X86AddressMode CalleeAM; + if (!X86SelectCallAddress(Callee, CalleeAM)) + return false; + + unsigned CalleeOp = 0; + const GlobalValue *GV = nullptr; + if (CalleeAM.GV != nullptr) { + GV = CalleeAM.GV; + } else if (CalleeAM.Base.Reg != 0) { + CalleeOp = CalleeAM.Base.Reg; + } else + return false; + + // Issue the call. + MachineInstrBuilder MIB; + if (CalleeOp) { + // Register-indirect call. + unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)) + .addReg(CalleeOp); + } else { + // Direct call. + assert(GV && "Not a direct call"); + unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; + + // See if we need any target-specific flags on the GV operand. + unsigned char OpFlags = 0; + + // On ELF targets, in both X86-64 and X86-32 mode, direct calls to + // external symbols most go through the PLT in PIC mode. If the symbol + // has hidden or protected visibility, or if it is static or local, then + // we don't need to use the PLT - we can directly call it. + if (Subtarget->isTargetELF() && + TM.getRelocationModel() == Reloc::PIC_ && + GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + !GV->isStrongDefinitionForLinker() && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } + + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); + if (Symbol) + MIB.addSym(Symbol, OpFlags); + else + MIB.addGlobalAddress(GV, 0, OpFlags); + } + + // Add a register mask operand representing the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); + + // Add an implicit use GOT pointer in EBX. 
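+  // (EBX was loaded with the GOT base before the call sequence above; the
+  // implicit use keeps that copy live across the call.)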
+ if (Subtarget->isPICStyleGOT()) + MIB.addReg(X86::EBX, RegState::Implicit); + + if (Is64Bit && IsVarArg && !IsWin64) + MIB.addReg(X86::AL, RegState::Implicit); + + // Add implicit physical register uses to the call. + for (auto Reg : OutRegs) + MIB.addReg(Reg, RegState::Implicit); + + // Issue CALLSEQ_END + unsigned NumBytesForCalleeToPop = + computeBytesPoppedByCallee(Subtarget, CC, CLI.CS); + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) + .addImm(NumBytes).addImm(NumBytesForCalleeToPop); + + // Now handle call return values. + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, + CLI.RetTy->getContext()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + + // Copy all of the result registers out of their specified physreg. + unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + EVT CopyVT = VA.getValVT(); + unsigned CopyReg = ResultReg + i; + + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. + if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) { + CopyVT = MVT::f80; + CopyReg = createResultReg(&X86::RFP80RegClass); + } + + // Copy out the result. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); + InRegs.push_back(VA.getLocReg()); + + // Round the f80 to the right size, which also moves it to the appropriate + // xmm register. This is accomplished by storing the f80 value in memory + // and then loading it back. + if (CopyVT != VA.getValVT()) { + EVT ResVT = VA.getValVT(); + unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; + unsigned MemSize = ResVT.getSizeInBits()/8; + int FI = MFI.CreateStackObject(MemSize, MemSize, false); + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc)), FI) + .addReg(CopyReg); + Opc = ResVT == MVT::f32 ? 
X86::MOVSSrm : X86::MOVSDrm; + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg + i), FI); + } + } + + CLI.ResultReg = ResultReg; + CLI.NumResultRegs = RVLocs.size(); + CLI.Call = MIB; + + return true; +} + +bool +X86FastISel::fastSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: break; + case Instruction::Load: + return X86SelectLoad(I); + case Instruction::Store: + return X86SelectStore(I); + case Instruction::Ret: + return X86SelectRet(I); + case Instruction::ICmp: + case Instruction::FCmp: + return X86SelectCmp(I); + case Instruction::ZExt: + return X86SelectZExt(I); + case Instruction::Br: + return X86SelectBranch(I); + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Shl: + return X86SelectShift(I); + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + return X86SelectDivRem(I); + case Instruction::Select: + return X86SelectSelect(I); + case Instruction::Trunc: + return X86SelectTrunc(I); + case Instruction::FPExt: + return X86SelectFPExt(I); + case Instruction::FPTrunc: + return X86SelectFPTrunc(I); + case Instruction::SIToFP: + return X86SelectSIToFP(I); + case Instruction::IntToPtr: // Deliberate fall-through. + case Instruction::PtrToInt: { + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + if (DstVT.bitsGT(SrcVT)) + return X86SelectZExt(I); + if (DstVT.bitsLT(SrcVT)) + return X86SelectTrunc(I); + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) return false; + updateValueMap(I, Reg); + return true; + } + case Instruction::BitCast: { + // Select SSE2/AVX bitcasts between 128/256 bit vector types. + if (!Subtarget->hasSSE2()) + return false; + + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + + if (!SrcVT.isSimple() || !DstVT.isSimple()) + return false; + + if (!SrcVT.is128BitVector() && + !(Subtarget->hasAVX() && SrcVT.is256BitVector())) + return false; + + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) + return false; + + // No instruction is needed for conversion. Reuse the register used by + // the fist operand. 
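+    // (Bitcasts between legal 128/256-bit vector types only reinterpret the
+    // bits; source and result share the same register, so no copy is needed.)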
+ updateValueMap(I, Reg); + return true; + } + } + + return false; +} + +unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { + if (VT > MVT::i64) + return 0; + + uint64_t Imm = CI->getZExtValue(); + if (Imm == 0) { + unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: + case MVT::i8: + return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, + X86::sub_8bit); + case MVT::i16: + return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true, + X86::sub_16bit); + case MVT::i32: + return SrcReg; + case MVT::i64: { + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + } + } + + unsigned Opc = 0; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: VT = MVT::i8; // fall-through + case MVT::i8: Opc = X86::MOV8ri; break; + case MVT::i16: Opc = X86::MOV16ri; break; + case MVT::i32: Opc = X86::MOV32ri; break; + case MVT::i64: { + if (isUInt<32>(Imm)) + Opc = X86::MOV32ri; + else if (isInt<32>(Imm)) + Opc = X86::MOV64ri32; + else + Opc = X86::MOV64ri; + break; + } + } + if (VT == MVT::i64 && Opc == X86::MOV32ri) { + unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm); + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); +} + +unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { + if (CFP->isNullValue()) + return fastMaterializeFloatZero(CFP); + + // Can't handle alternate code models yet. + CodeModel::Model CM = TM.getCodeModel(); + if (CM != CodeModel::Small && CM != CodeModel::Large) + return 0; + + // Get opcode and regclass of the output for the given load instruction. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + switch (VT.SimpleTy) { + default: return 0; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp32m; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp64m; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. + return 0; + } + + // MachineConstantPool wants an explicit alignment. + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) { + // Alignment of vector types. FIXME! + Align = DL.getTypeAllocSize(CFP->getType()); + } + + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + unsigned char OpFlag = 0; + if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic + OpFlag = X86II::MO_PIC_BASE_OFFSET; + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } else if (Subtarget->isPICStyleGOT()) { + OpFlag = X86II::MO_GOTOFF; + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } else if (Subtarget->isPICStyleRIPRel() && + TM.getCodeModel() == CodeModel::Small) { + PICBase = X86::RIP; + } + + // Create the load from the constant pool. 
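+  // With the small code model this is a single load from the constant pool
+  // (PIC-base- or RIP-relative when needed), e.g.
+  //   movsd .LCPI0_0(%rip), %xmm0
+  // With the large code model the pool address is first materialized into a
+  // GR64 register and the value is then loaded through that register.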
+ unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); + unsigned ResultReg = createResultReg(RC); + + if (CM == CodeModel::Large) { + unsigned AddrReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + AddrReg) + .addConstantPoolIndex(CPI, 0, OpFlag); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg); + addDirectMem(MIB, AddrReg); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, DL.getPointerSize(), Align); + MIB->addMemOperand(*FuncInfo.MF, MMO); + return ResultReg; + } + + addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), + CPI, PICBase, OpFlag); + return ResultReg; +} + +unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return 0; + + // Materialize addresses with LEA/MOV instructions. + X86AddressMode AM; + if (X86SelectAddress(GV, AM)) { + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. + if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) + return AM.Base.Reg; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + if (TM.getRelocationModel() == Reloc::Static && + TLI.getPointerTy(DL) == MVT::i64) { + // The displacement code could be more than 32 bits away so we need to use + // an instruction with a 64 bit immediate + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + ResultReg) + .addGlobalAddress(GV); + } else { + unsigned Opc = + TLI.getPointerTy(DL) == MVT::i32 + ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), AM); + } + return ResultReg; + } + return 0; +} + +unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(DL, C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) + return 0; + MVT VT = CEVT.getSimpleVT(); + + if (const auto *CI = dyn_cast<ConstantInt>(C)) + return X86MaterializeInt(CI, VT); + else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return X86MaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return X86MaterializeGV(GV, VT); + + return 0; +} + +unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { + // Fail on dynamic allocas. At this point, getRegForValue has already + // checked its CSE maps, so if we're here trying to handle a dynamic + // alloca, we're not going to succeed. X86SelectAddress has a + // check for dynamic allocas, because it's called directly from + // various places, but targetMaterializeAlloca also needs a check + // in order to avoid recursion between getRegForValue, + // X86SelectAddrss, and targetMaterializeAlloca. + if (!FuncInfo.StaticAllocaMap.count(C)) + return 0; + assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?"); + + X86AddressMode AM; + if (!X86SelectAddress(C, AM)) + return 0; + unsigned Opc = + TLI.getPointerTy(DL) == MVT::i32 + ? (Subtarget->isTarget64BitILP32() ? 
X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; + const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL)); + unsigned ResultReg = createResultReg(RC); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), AM); + return ResultReg; +} + +unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { + MVT VT; + if (!isTypeLegal(CF->getType(), VT)) + return 0; + + // Get opcode and regclass for the given zero. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + switch (VT.SimpleTy) { + default: return 0; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = X86::FsFLD0SS; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp032; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = X86::FsFLD0SD; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp064; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. + return 0; + } + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + return ResultReg; +} + + +bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) { + const Value *Ptr = LI->getPointerOperand(); + X86AddressMode AM; + if (!X86SelectAddress(Ptr, AM)) + return false; + + const X86InstrInfo &XII = (const X86InstrInfo &)TII; + + unsigned Size = DL.getTypeAllocSize(LI->getType()); + unsigned Alignment = LI->getAlignment(); + + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = DL.getABITypeAlignment(LI->getType()); + + SmallVector<MachineOperand, 8> AddrOps; + AM.getFullAddress(AddrOps); + + MachineInstr *Result = XII.foldMemoryOperandImpl( + *FuncInfo.MF, MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment, + /*AllowCommute=*/true); + if (!Result) + return false; + + // The index register could be in the wrong register class. Unfortunately, + // foldMemoryOperandImpl could have commuted the instruction so its not enough + // to just look at OpNo + the offset to the index reg. We actually need to + // scan the instruction to find the index reg and see if its the correct reg + // class. + unsigned OperandNo = 0; + for (MachineInstr::mop_iterator I = Result->operands_begin(), + E = Result->operands_end(); I != E; ++I, ++OperandNo) { + MachineOperand &MO = *I; + if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg) + continue; + // Found the index reg, now try to rewrite it. + unsigned IndexReg = constrainOperandRegClass(Result->getDesc(), + MO.getReg(), OperandNo); + if (IndexReg == MO.getReg()) + continue; + MO.setReg(IndexReg); + } + + Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); + MI->eraseFromParent(); + return true; +} + + +namespace llvm { + FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) { + return new X86FastISel(funcInfo, libInfo); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp new file mode 100644 index 0000000..1dd69e8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -0,0 +1,410 @@ +//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that finds instructions that can be +// re-written as LEA instructions in order to reduce pipeline delays. +// When optimizing for size it replaces suitable LEAs with INC or DEC. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-fixup-LEAs" + +STATISTIC(NumLEAs, "Number of LEA instructions created"); + +namespace { +class FixupLEAPass : public MachineFunctionPass { + enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; + static char ID; + /// \brief Loop over all of the instructions in the basic block + /// replacing applicable instructions with LEA instructions, + /// where appropriate. + bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); + + const char *getPassName() const override { return "X86 LEA Fixup"; } + + /// \brief Given a machine register, look for the instruction + /// which writes it in the current basic block. If found, + /// try to replace it with an equivalent LEA instruction. + /// If replacement succeeds, then also process the newly created + /// instruction. + void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); + + /// \brief Given a memory access or LEA instruction + /// whose address mode uses a base and/or index register, look for + /// an opportunity to replace the instruction which sets the base or index + /// register with an equivalent LEA instruction. + void processInstruction(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); + + /// \brief Given a LEA instruction which is unprofitable + /// on Silvermont try to replace it with an equivalent ADD instruction + void processInstructionForSLM(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); + + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg + /// and convert them to INC or DEC respectively. + bool fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const; + + /// \brief Determine if an instruction references a machine register + /// and, if so, whether it reads or writes the register. + RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); + + /// \brief Step backwards through a basic block, looking + /// for an instruction which writes a register within + /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. + MachineBasicBlock::iterator searchBackwards(MachineOperand &p, + MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); + + /// \brief if an instruction can be converted to an + /// equivalent LEA, insert the new instruction into the basic block + /// and return a pointer to it. Otherwise, return zero. 
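+  /// For example, "movl %ecx, %eax" becomes "leal (%ecx), %eax" and
+  /// "addl $8, %eax" becomes "leal 8(%eax), %eax".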
+ MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const; + +public: + FixupLEAPass() : MachineFunctionPass(ID) {} + + /// \brief Loop over all of the basic blocks, + /// replacing instructions by equivalent LEA instructions + /// if needed and when possible. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + MachineFunction *MF; + const X86InstrInfo *TII; // Machine instruction info. + bool OptIncDec; + bool OptLEA; +}; +char FixupLEAPass::ID = 0; +} + +MachineInstr * +FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const { + MachineInstr *MI = MBBI; + MachineInstr *NewMI; + switch (MI->getOpcode()) { + case X86::MOV32rr: + case X86::MOV64rr: { + const MachineOperand &Src = MI->getOperand(1); + const MachineOperand &Dest = MI->getOperand(0); + NewMI = BuildMI(*MF, MI->getDebugLoc(), + TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r + : X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0); + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (!MI->getOperand(2).isImm()) { + // convertToThreeAddress will call getImm() + // which requires isImm() to be true + return nullptr; + } + break; + case X86::ADD16rr: + case X86::ADD16rr_DB: + if (MI->getOperand(1).getReg() != MI->getOperand(2).getReg()) { + // if src1 != src2, then convertToThreeAddress will + // need to create a Virtual register, which we cannot do + // after register allocation. + return nullptr; + } + } + return TII->convertToThreeAddress(MFI, MBBI, nullptr); +} + +FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } + +bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); + OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); + OptLEA = ST.LEAusesAG() || ST.slowLEA(); + + if (!OptLEA && !OptIncDec) + return false; + + TII = ST.getInstrInfo(); + + DEBUG(dbgs() << "Start X86FixupLEAs\n";); + // Process all basic blocks. + for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I) + processBasicBlock(Func, I); + DEBUG(dbgs() << "End X86FixupLEAs\n";); + + return true; +} + +FixupLEAPass::RegUsageState +FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { + RegUsageState RegUsage = RU_NotUsed; + MachineInstr *MI = I; + + for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand &opnd = MI->getOperand(i); + if (opnd.isReg() && opnd.getReg() == p.getReg()) { + if (opnd.isDef()) + return RU_Write; + RegUsage = RU_Read; + } + } + return RegUsage; +} + +/// getPreviousInstr - Given a reference to an instruction in a basic +/// block, return a reference to the previous instruction in the block, +/// wrapping around to the last instruction of the block if the block +/// branches to itself. 
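+/// Returns false when there is no previous instruction to step to, i.e. when
+/// I is at the beginning of a block that does not branch to itself.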
+static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + if (I == MFI->begin()) { + if (MFI->isPredecessor(&*MFI)) { + I = --MFI->end(); + return true; + } else + return false; + } + --I; + return true; +} + +MachineBasicBlock::iterator +FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + int InstrDistance = 1; + MachineBasicBlock::iterator CurInst; + static const int INSTR_DISTANCE_THRESHOLD = 5; + + CurInst = I; + bool Found; + Found = getPreviousInstr(CurInst, MFI); + while (Found && I != CurInst) { + if (CurInst->isCall() || CurInst->isInlineAsm()) + break; + if (InstrDistance > INSTR_DISTANCE_THRESHOLD) + break; // too far back to make a difference + if (usesRegister(p, CurInst) == RU_Write) { + return CurInst; + } + InstrDistance += TII->getInstrLatency( + MF->getSubtarget().getInstrItineraryData(), CurInst); + Found = getPreviousInstr(CurInst, MFI); + } + return nullptr; +} + +static inline bool isLEA(const int opcode) { + return opcode == X86::LEA16r || opcode == X86::LEA32r || + opcode == X86::LEA64r || opcode == X86::LEA64_32r; +} + +/// isLEASimpleIncOrDec - Does this LEA have one these forms: +/// lea %reg, 1(%reg) +/// lea %reg, -1(%reg) +static inline bool isLEASimpleIncOrDec(MachineInstr *LEA) { + unsigned SrcReg = LEA->getOperand(1 + X86::AddrBaseReg).getReg(); + unsigned DstReg = LEA->getOperand(0).getReg(); + unsigned AddrDispOp = 1 + X86::AddrDisp; + return SrcReg == DstReg && + LEA->getOperand(1 + X86::AddrIndexReg).getReg() == 0 && + LEA->getOperand(1 + X86::AddrSegmentReg).getReg() == 0 && + LEA->getOperand(AddrDispOp).isImm() && + (LEA->getOperand(AddrDispOp).getImm() == 1 || + LEA->getOperand(AddrDispOp).getImm() == -1); +} + +bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const { + MachineInstr *MI = I; + int Opcode = MI->getOpcode(); + if (!isLEA(Opcode)) + return false; + + if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) { + int NewOpcode; + bool isINC = MI->getOperand(4).getImm() == 1; + switch (Opcode) { + case X86::LEA16r: + NewOpcode = isINC ? X86::INC16r : X86::DEC16r; + break; + case X86::LEA32r: + case X86::LEA64_32r: + NewOpcode = isINC ? X86::INC32r : X86::DEC32r; + break; + case X86::LEA64r: + NewOpcode = isINC ? X86::INC64r : X86::DEC64r; + break; + } + + MachineInstr *NewMI = + BuildMI(*MFI, I, MI->getDebugLoc(), TII->get(NewOpcode)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)); + MFI->erase(I); + I = static_cast<MachineBasicBlock::iterator>(NewMI); + return true; + } + return false; +} + +void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + // Process a load, store, or LEA instruction. 
+ MachineInstr *MI = I; + int opcode = MI->getOpcode(); + const MCInstrDesc &Desc = MI->getDesc(); + int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode); + if (AddrOffset >= 0) { + AddrOffset += X86II::getOperandBias(Desc); + MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg); + if (p.isReg() && p.getReg() != X86::ESP) { + seekLEAFixup(p, I, MFI); + } + MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg); + if (q.isReg() && q.getReg() != X86::ESP) { + seekLEAFixup(q, I, MFI); + } + } +} + +void FixupLEAPass::seekLEAFixup(MachineOperand &p, + MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); + if (MBI) { + MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI); + if (NewMI) { + ++NumLEAs; + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); + // now to replace with an equivalent LEA... + DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); + MFI->erase(MBI); + MachineBasicBlock::iterator J = + static_cast<MachineBasicBlock::iterator>(NewMI); + processInstruction(J, MFI); + } + } +} + +void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + MachineInstr *MI = I; + const int opcode = MI->getOpcode(); + if (!isLEA(opcode)) + return; + if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() || + !TII->isSafeToClobberEFLAGS(*MFI, I)) + return; + const unsigned DstR = MI->getOperand(0).getReg(); + const unsigned SrcR1 = MI->getOperand(1).getReg(); + const unsigned SrcR2 = MI->getOperand(3).getReg(); + if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR)) + return; + if (MI->getOperand(2).getImm() > 1) + return; + int addrr_opcode, addri_opcode; + switch (opcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + addrr_opcode = X86::ADD16rr; + addri_opcode = X86::ADD16ri; + break; + case X86::LEA32r: + addrr_opcode = X86::ADD32rr; + addri_opcode = X86::ADD32ri; + break; + case X86::LEA64_32r: + case X86::LEA64r: + addrr_opcode = X86::ADD64rr; + addri_opcode = X86::ADD64ri32; + break; + } + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + MachineInstr *NewMI = nullptr; + const MachineOperand &Dst = MI->getOperand(0); + // Make ADD instruction for two registers writing to LEA's destination + if (SrcR1 != 0 && SrcR2 != 0) { + const MachineOperand &Src1 = MI->getOperand(SrcR1 == DstR ? 1 : 3); + const MachineOperand &Src2 = MI->getOperand(SrcR1 == DstR ? 3 : 1); + NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addrr_opcode)) + .addOperand(Dst) + .addOperand(Src1) + .addOperand(Src2); + MFI->insert(I, NewMI); + DEBUG(NewMI->dump();); + } + // Make ADD instruction for immediate + if (MI->getOperand(4).getImm() != 0) { + const MachineOperand &SrcR = MI->getOperand(SrcR1 == DstR ? 
1 : 3); + NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addri_opcode)) + .addOperand(Dst) + .addOperand(SrcR) + .addImm(MI->getOperand(4).getImm()); + MFI->insert(I, NewMI); + DEBUG(NewMI->dump();); + } + if (NewMI) { + MFI->erase(I); + I = static_cast<MachineBasicBlock::iterator>(NewMI); + } +} + +bool FixupLEAPass::processBasicBlock(MachineFunction &MF, + MachineFunction::iterator MFI) { + + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { + if (OptIncDec) + if (fixupIncDec(I, MFI)) + continue; + + if (OptLEA) { + if (MF.getSubtarget<X86Subtarget>().isSLM()) + processInstructionForSLM(I, MFI); + else + processInstruction(I, MFI); + } + } + return false; +} diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp new file mode 100644 index 0000000..97bb8ab --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -0,0 +1,1651 @@ +//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which converts floating point instructions from +// pseudo registers into register stack instructions. This pass uses live +// variable information to indicate where the FPn registers are used and their +// lifetimes. +// +// The x87 hardware tracks liveness of the stack registers, so it is necessary +// to implement exact liveness tracking between basic blocks. The CFG edges are +// partitioned into bundles where the same FP registers must be live in +// identical stack positions. Instructions are inserted at the end of each basic +// block to rearrange the live registers to match the outgoing bundle. +// +// This approach avoids splitting critical edges at the potential cost of more +// live register shuffling instructions when critical edges are present. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <bitset> +using namespace llvm; + +#define DEBUG_TYPE "x86-codegen" + +STATISTIC(NumFXCH, "Number of fxch instructions inserted"); +STATISTIC(NumFP , "Number of floating point instructions"); + +namespace { + const unsigned ScratchFPReg = 7; + + struct FPS : public MachineFunctionPass { + static char ID; + FPS() : MachineFunctionPass(ID) { + initializeEdgeBundlesPass(*PassRegistry::getPassRegistry()); + // This is really only to keep valgrind quiet. + // The logic in isLive() is too much for it. 
+ memset(Stack, 0, sizeof(Stack)); + memset(RegMap, 0, sizeof(RegMap)); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<EdgeBundles>(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { return "X86 FP Stackifier"; } + + private: + const TargetInstrInfo *TII; // Machine instruction info. + + // Two CFG edges are related if they leave the same block, or enter the same + // block. The transitive closure of an edge under this relation is a + // LiveBundle. It represents a set of CFG edges where the live FP stack + // registers must be allocated identically in the x87 stack. + // + // A LiveBundle is usually all the edges leaving a block, or all the edges + // entering a block, but it can contain more edges if critical edges are + // present. + // + // The set of live FP registers in a LiveBundle is calculated by bundleCFG, + // but the exact mapping of FP registers to stack slots is fixed later. + struct LiveBundle { + // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c. + unsigned Mask; + + // Number of pre-assigned live registers in FixStack. This is 0 when the + // stack order has not yet been fixed. + unsigned FixCount; + + // Assigned stack order for live-in registers. + // FixStack[i] == getStackEntry(i) for all i < FixCount. + unsigned char FixStack[8]; + + LiveBundle() : Mask(0), FixCount(0) {} + + // Have the live registers been assigned a stack order yet? + bool isFixed() const { return !Mask || FixCount; } + }; + + // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges + // with no live FP registers. + SmallVector<LiveBundle, 8> LiveBundles; + + // The edge bundle analysis provides indices into the LiveBundles vector. + EdgeBundles *Bundles; + + // Return a bitmask of FP registers in block's live-in list. + static unsigned calcLiveInMask(MachineBasicBlock *MBB) { + unsigned Mask = 0; + for (const auto &LI : MBB->liveins()) { + if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6) + continue; + Mask |= 1 << (LI.PhysReg - X86::FP0); + } + return Mask; + } + + // Partition all the CFG edges into LiveBundles. + void bundleCFG(MachineFunction &MF); + + MachineBasicBlock *MBB; // Current basic block + + // The hardware keeps track of how many FP registers are live, so we have + // to model that exactly. Usually, each live register corresponds to an + // FP<n> register, but when dealing with calls, returns, and inline + // assembly, it is sometimes necessary to have live scratch registers. + unsigned Stack[8]; // FP<n> Registers in each stack slot... + unsigned StackTop; // The current top of the FP stack. + + enum { + NumFPRegs = 8 // Including scratch pseudo-registers. + }; + + // For each live FP<n> register, point to its Stack[] entry. + // The first entries correspond to FP0-FP6, the rest are scratch registers + // used when we need slightly different live registers than what the + // register allocator thinks. + unsigned RegMap[NumFPRegs]; + + // Set up our stack model to match the incoming registers to MBB. + void setupBlockStack(); + + // Shuffle live registers to match the expectations of successor blocks. 
+ void finishBlockStack(); + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dumpStack() const { + dbgs() << "Stack contents:"; + for (unsigned i = 0; i != StackTop; ++i) { + dbgs() << " FP" << Stack[i]; + assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!"); + } + } +#endif + + /// getSlot - Return the stack slot number a particular register number is + /// in. + unsigned getSlot(unsigned RegNo) const { + assert(RegNo < NumFPRegs && "Regno out of range!"); + return RegMap[RegNo]; + } + + /// isLive - Is RegNo currently live in the stack? + bool isLive(unsigned RegNo) const { + unsigned Slot = getSlot(RegNo); + return Slot < StackTop && Stack[Slot] == RegNo; + } + + /// getStackEntry - Return the X86::FP<n> register in register ST(i). + unsigned getStackEntry(unsigned STi) const { + if (STi >= StackTop) + report_fatal_error("Access past stack top!"); + return Stack[StackTop-1-STi]; + } + + /// getSTReg - Return the X86::ST(i) register which contains the specified + /// FP<RegNo> register. + unsigned getSTReg(unsigned RegNo) const { + return StackTop - 1 - getSlot(RegNo) + X86::ST0; + } + + // pushReg - Push the specified FP<n> register onto the stack. + void pushReg(unsigned Reg) { + assert(Reg < NumFPRegs && "Register number out of range!"); + if (StackTop >= 8) + report_fatal_error("Stack overflow!"); + Stack[StackTop] = Reg; + RegMap[Reg] = StackTop++; + } + + bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; } + void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) { + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); + if (isAtTop(RegNo)) return; + + unsigned STReg = getSTReg(RegNo); + unsigned RegOnTop = getStackEntry(0); + + // Swap the slots the regs are in. + std::swap(RegMap[RegNo], RegMap[RegOnTop]); + + // Swap stack slot contents. + if (RegMap[RegOnTop] >= StackTop) + report_fatal_error("Access past stack top!"); + std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]); + + // Emit an fxch to update the runtime processors version of the state. + BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg); + ++NumFXCH; + } + + void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) { + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); + unsigned STReg = getSTReg(RegNo); + pushReg(AsReg); // New register on top of stack + + BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg); + } + + /// popStackAfter - Pop the current value off of the top of the FP stack + /// after the specified instruction. + void popStackAfter(MachineBasicBlock::iterator &I); + + /// freeStackSlotAfter - Free the specified register from the register + /// stack, so that it is no longer in a register. If the register is + /// currently at the top of the stack, we just pop the current instruction, + /// otherwise we store the current top-of-stack into the specified slot, + /// then pop the top of stack. + void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg); + + /// freeStackSlotBefore - Just the pop, no folding. Return the inserted + /// instruction. + MachineBasicBlock::iterator + freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo); + + /// Adjust the live registers to be the set in Mask. + void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I); + + /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is + /// st(0), FP reg FixStack[1] is st(1) etc. 
+ void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount, + MachineBasicBlock::iterator I); + + bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + + void handleCall(MachineBasicBlock::iterator &I); + void handleZeroArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFPRW(MachineBasicBlock::iterator &I); + void handleTwoArgFP(MachineBasicBlock::iterator &I); + void handleCompareFP(MachineBasicBlock::iterator &I); + void handleCondMovFP(MachineBasicBlock::iterator &I); + void handleSpecialFP(MachineBasicBlock::iterator &I); + + // Check if a COPY instruction is using FP registers. + static bool isFPCopy(MachineInstr *MI) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + return X86::RFP80RegClass.contains(DstReg) || + X86::RFP80RegClass.contains(SrcReg); + } + + void setKillFlags(MachineBasicBlock &MBB) const; + }; + char FPS::ID = 0; +} + +FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } + +/// getFPReg - Return the X86::FPx register number for the specified operand. +/// For example, this returns 3 for X86::FP3. +static unsigned getFPReg(const MachineOperand &MO) { + assert(MO.isReg() && "Expected an FP register!"); + unsigned Reg = MO.getReg(); + assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!"); + return Reg - X86::FP0; +} + +/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP +/// register references into FP stack references. +/// +bool FPS::runOnMachineFunction(MachineFunction &MF) { + // We only need to run this pass if there are any FP registers used in this + // function. If it is all integer, there is nothing for us to do! + bool FPIsUsed = false; + + static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (unsigned i = 0; i <= 6; ++i) + if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { + FPIsUsed = true; + break; + } + + // Early exit. + if (!FPIsUsed) return false; + + Bundles = &getAnalysis<EdgeBundles>(); + TII = MF.getSubtarget().getInstrInfo(); + + // Prepare cross-MBB liveness. + bundleCFG(MF); + + StackTop = 0; + + // Process the function in depth first order so that we process at least one + // of the predecessors for every reachable block in the function. + SmallPtrSet<MachineBasicBlock*, 8> Processed; + MachineBasicBlock *Entry = &MF.front(); + + bool Changed = false; + for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed)) + Changed |= processBasicBlock(MF, *BB); + + // Process any unreachable blocks in arbitrary order now. + if (MF.size() != Processed.size()) + for (MachineBasicBlock &BB : MF) + if (Processed.insert(&BB).second) + Changed |= processBasicBlock(MF, BB); + + LiveBundles.clear(); + + return Changed; +} + +/// bundleCFG - Scan all the basic blocks to determine consistent live-in and +/// live-out sets for the FP registers. Consistent means that the set of +/// registers live-out from a block is identical to the live-in set of all +/// successors. This is not enforced by the normal live-in lists since +/// registers may be implicitly defined, or not used by all successors. +void FPS::bundleCFG(MachineFunction &MF) { + assert(LiveBundles.empty() && "Stale data in LiveBundles"); + LiveBundles.resize(Bundles->getNumBundles()); + + // Gather the actual live-in masks for all MBBs. 
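+  // Bit i of a mask is set when FP<i> is live into the block; masks of blocks
+  // whose incoming edges fall in the same bundle are ORed together below.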
+ for (MachineBasicBlock &MBB : MF) { + const unsigned Mask = calcLiveInMask(&MBB); + if (!Mask) + continue; + // Update MBB ingoing bundle mask. + LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask; + } +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// transforming FP instructions into their stack form. +/// +bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { + bool Changed = false; + MBB = &BB; + + setKillFlags(BB); + setupBlockStack(); + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { + MachineInstr *MI = I; + uint64_t Flags = MI->getDesc().TSFlags; + + unsigned FPInstClass = Flags & X86II::FPTypeMask; + if (MI->isInlineAsm()) + FPInstClass = X86II::SpecialFP; + + if (MI->isCopy() && isFPCopy(MI)) + FPInstClass = X86II::SpecialFP; + + if (MI->isImplicitDef() && + X86::RFP80RegClass.contains(MI->getOperand(0).getReg())) + FPInstClass = X86II::SpecialFP; + + if (MI->isCall()) + FPInstClass = X86II::SpecialFP; + + if (FPInstClass == X86II::NotFP) + continue; // Efficiently ignore non-fp insts! + + MachineInstr *PrevMI = nullptr; + if (I != BB.begin()) + PrevMI = std::prev(I); + + ++NumFP; // Keep track of # of pseudo instrs + DEBUG(dbgs() << "\nFPInst:\t" << *MI); + + // Get dead variables list now because the MI pointer may be deleted as part + // of processing! + SmallVector<unsigned, 8> DeadRegs; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDead()) + DeadRegs.push_back(MO.getReg()); + } + + switch (FPInstClass) { + case X86II::ZeroArgFP: handleZeroArgFP(I); break; + case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0) + case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0)) + case X86II::TwoArgFP: handleTwoArgFP(I); break; + case X86II::CompareFP: handleCompareFP(I); break; + case X86II::CondMovFP: handleCondMovFP(I); break; + case X86II::SpecialFP: handleSpecialFP(I); break; + default: llvm_unreachable("Unknown FP Type!"); + } + + // Check to see if any of the values defined by this instruction are dead + // after definition. If so, pop them. + for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) { + unsigned Reg = DeadRegs[i]; + // Check if Reg is live on the stack. An inline-asm register operand that + // is in the clobber list and marked dead might not be live on the stack. + if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) { + DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n"); + freeStackSlotAfter(I, Reg-X86::FP0); + } + } + + // Print out all of the instructions expanded to if -debug + DEBUG( + MachineBasicBlock::iterator PrevI(PrevMI); + if (I == PrevI) { + dbgs() << "Just deleted pseudo instruction\n"; + } else { + MachineBasicBlock::iterator Start = I; + // Rewind to first instruction newly inserted. + while (Start != BB.begin() && std::prev(Start) != PrevI) --Start; + dbgs() << "Inserted instructions:\n\t"; + Start->print(dbgs()); + while (++Start != std::next(I)) {} + } + dumpStack(); + ); + (void)PrevMI; + + Changed = true; + } + + finishBlockStack(); + + return Changed; +} + +/// setupBlockStack - Use the live bundles to set up our model of the stack +/// to match predecessors' live out stack. +void FPS::setupBlockStack() { + DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + StackTop = 0; + // Get the live-in bundle for MBB. 
+ const LiveBundle &Bundle = + LiveBundles[Bundles->getBundle(MBB->getNumber(), false)]; + + if (!Bundle.Mask) { + DEBUG(dbgs() << "Block has no FP live-ins.\n"); + return; + } + + // Depth-first iteration should ensure that we always have an assigned stack. + assert(Bundle.isFixed() && "Reached block before any predecessors"); + + // Push the fixed live-in registers. + for (unsigned i = Bundle.FixCount; i > 0; --i) { + MBB->addLiveIn(X86::ST0+i-1); + DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP" + << unsigned(Bundle.FixStack[i-1]) << '\n'); + pushReg(Bundle.FixStack[i-1]); + } + + // Kill off unwanted live-ins. This can happen with a critical edge. + // FIXME: We could keep these live registers around as zombies. They may need + // to be revived at the end of a short block. It might save a few instrs. + adjustLiveRegs(calcLiveInMask(MBB), MBB->begin()); + DEBUG(MBB->dump()); +} + +/// finishBlockStack - Revive live-outs that are implicitly defined out of +/// MBB. Shuffle live registers to match the expected fixed stack of any +/// predecessors, and ensure that all predecessors are expecting the same +/// stack. +void FPS::finishBlockStack() { + // The RET handling below takes care of return blocks for us. + if (MBB->succ_empty()) + return; + + DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + + // Get MBB's live-out bundle. + unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true); + LiveBundle &Bundle = LiveBundles[BundleIdx]; + + // We may need to kill and define some registers to match successors. + // FIXME: This can probably be combined with the shuffle below. + MachineBasicBlock::iterator Term = MBB->getFirstTerminator(); + adjustLiveRegs(Bundle.Mask, Term); + + if (!Bundle.Mask) { + DEBUG(dbgs() << "No live-outs.\n"); + return; + } + + // Has the stack order been fixed yet? + DEBUG(dbgs() << "LB#" << BundleIdx << ": "); + if (Bundle.isFixed()) { + DEBUG(dbgs() << "Shuffling stack to match.\n"); + shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term); + } else { + // Not fixed yet, we get to choose. 
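+    // Record the current stack order as this bundle's canonical layout; the
+    // other blocks entering or leaving through this bundle will be shuffled
+    // (or set up) to match it.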
+ DEBUG(dbgs() << "Fixing stack order now.\n"); + Bundle.FixCount = StackTop; + for (unsigned i = 0; i < StackTop; ++i) + Bundle.FixStack[i] = getStackEntry(i); + } +} + + +//===----------------------------------------------------------------------===// +// Efficient Lookup Table Support +//===----------------------------------------------------------------------===// + +namespace { + struct TableEntry { + uint16_t from; + uint16_t to; + bool operator<(const TableEntry &TE) const { return from < TE.from; } + friend bool operator<(const TableEntry &TE, unsigned V) { + return TE.from < V; + } + friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V, + const TableEntry &TE) { + return V < TE.from; + } + }; +} + +static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { + const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode); + if (I != Table.end() && I->from == Opcode) + return I->to; + return -1; +} + +#ifdef NDEBUG +#define ASSERT_SORTED(TABLE) +#else +#define ASSERT_SORTED(TABLE) \ + { static bool TABLE##Checked = false; \ + if (!TABLE##Checked) { \ + assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \ + "All lookup tables must be sorted for efficient access!"); \ + TABLE##Checked = true; \ + } \ + } +#endif + +//===----------------------------------------------------------------------===// +// Register File -> Register Stack Mapping Methods +//===----------------------------------------------------------------------===// + +// OpcodeTable - Sorted map of register instructions to their stack version. +// The first element is an register file pseudo instruction, the second is the +// concrete X86 instruction which uses the register stack. +// +static const TableEntry OpcodeTable[] = { + { X86::ABS_Fp32 , X86::ABS_F }, + { X86::ABS_Fp64 , X86::ABS_F }, + { X86::ABS_Fp80 , X86::ABS_F }, + { X86::ADD_Fp32m , X86::ADD_F32m }, + { X86::ADD_Fp64m , X86::ADD_F64m }, + { X86::ADD_Fp64m32 , X86::ADD_F32m }, + { X86::ADD_Fp80m32 , X86::ADD_F32m }, + { X86::ADD_Fp80m64 , X86::ADD_F64m }, + { X86::ADD_FpI16m32 , X86::ADD_FI16m }, + { X86::ADD_FpI16m64 , X86::ADD_FI16m }, + { X86::ADD_FpI16m80 , X86::ADD_FI16m }, + { X86::ADD_FpI32m32 , X86::ADD_FI32m }, + { X86::ADD_FpI32m64 , X86::ADD_FI32m }, + { X86::ADD_FpI32m80 , X86::ADD_FI32m }, + { X86::CHS_Fp32 , X86::CHS_F }, + { X86::CHS_Fp64 , X86::CHS_F }, + { X86::CHS_Fp80 , X86::CHS_F }, + { X86::CMOVBE_Fp32 , X86::CMOVBE_F }, + { X86::CMOVBE_Fp64 , X86::CMOVBE_F }, + { X86::CMOVBE_Fp80 , X86::CMOVBE_F }, + { X86::CMOVB_Fp32 , X86::CMOVB_F }, + { X86::CMOVB_Fp64 , X86::CMOVB_F }, + { X86::CMOVB_Fp80 , X86::CMOVB_F }, + { X86::CMOVE_Fp32 , X86::CMOVE_F }, + { X86::CMOVE_Fp64 , X86::CMOVE_F }, + { X86::CMOVE_Fp80 , X86::CMOVE_F }, + { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F }, + { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F }, + { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F }, + { X86::CMOVNB_Fp32 , X86::CMOVNB_F }, + { X86::CMOVNB_Fp64 , X86::CMOVNB_F }, + { X86::CMOVNB_Fp80 , X86::CMOVNB_F }, + { X86::CMOVNE_Fp32 , X86::CMOVNE_F }, + { X86::CMOVNE_Fp64 , X86::CMOVNE_F }, + { X86::CMOVNE_Fp80 , X86::CMOVNE_F }, + { X86::CMOVNP_Fp32 , X86::CMOVNP_F }, + { X86::CMOVNP_Fp64 , X86::CMOVNP_F }, + { X86::CMOVNP_Fp80 , X86::CMOVNP_F }, + { X86::CMOVP_Fp32 , X86::CMOVP_F }, + { X86::CMOVP_Fp64 , X86::CMOVP_F }, + { X86::CMOVP_Fp80 , X86::CMOVP_F }, + { X86::COS_Fp32 , X86::COS_F }, + { X86::COS_Fp64 , X86::COS_F }, + { X86::COS_Fp80 , X86::COS_F }, + { X86::DIVR_Fp32m , X86::DIVR_F32m }, + { X86::DIVR_Fp64m , X86::DIVR_F64m }, + { 
X86::DIVR_Fp64m32 , X86::DIVR_F32m }, + { X86::DIVR_Fp80m32 , X86::DIVR_F32m }, + { X86::DIVR_Fp80m64 , X86::DIVR_F64m }, + { X86::DIVR_FpI16m32, X86::DIVR_FI16m}, + { X86::DIVR_FpI16m64, X86::DIVR_FI16m}, + { X86::DIVR_FpI16m80, X86::DIVR_FI16m}, + { X86::DIVR_FpI32m32, X86::DIVR_FI32m}, + { X86::DIVR_FpI32m64, X86::DIVR_FI32m}, + { X86::DIVR_FpI32m80, X86::DIVR_FI32m}, + { X86::DIV_Fp32m , X86::DIV_F32m }, + { X86::DIV_Fp64m , X86::DIV_F64m }, + { X86::DIV_Fp64m32 , X86::DIV_F32m }, + { X86::DIV_Fp80m32 , X86::DIV_F32m }, + { X86::DIV_Fp80m64 , X86::DIV_F64m }, + { X86::DIV_FpI16m32 , X86::DIV_FI16m }, + { X86::DIV_FpI16m64 , X86::DIV_FI16m }, + { X86::DIV_FpI16m80 , X86::DIV_FI16m }, + { X86::DIV_FpI32m32 , X86::DIV_FI32m }, + { X86::DIV_FpI32m64 , X86::DIV_FI32m }, + { X86::DIV_FpI32m80 , X86::DIV_FI32m }, + { X86::ILD_Fp16m32 , X86::ILD_F16m }, + { X86::ILD_Fp16m64 , X86::ILD_F16m }, + { X86::ILD_Fp16m80 , X86::ILD_F16m }, + { X86::ILD_Fp32m32 , X86::ILD_F32m }, + { X86::ILD_Fp32m64 , X86::ILD_F32m }, + { X86::ILD_Fp32m80 , X86::ILD_F32m }, + { X86::ILD_Fp64m32 , X86::ILD_F64m }, + { X86::ILD_Fp64m64 , X86::ILD_F64m }, + { X86::ILD_Fp64m80 , X86::ILD_F64m }, + { X86::ISTT_Fp16m32 , X86::ISTT_FP16m}, + { X86::ISTT_Fp16m64 , X86::ISTT_FP16m}, + { X86::ISTT_Fp16m80 , X86::ISTT_FP16m}, + { X86::ISTT_Fp32m32 , X86::ISTT_FP32m}, + { X86::ISTT_Fp32m64 , X86::ISTT_FP32m}, + { X86::ISTT_Fp32m80 , X86::ISTT_FP32m}, + { X86::ISTT_Fp64m32 , X86::ISTT_FP64m}, + { X86::ISTT_Fp64m64 , X86::ISTT_FP64m}, + { X86::ISTT_Fp64m80 , X86::ISTT_FP64m}, + { X86::IST_Fp16m32 , X86::IST_F16m }, + { X86::IST_Fp16m64 , X86::IST_F16m }, + { X86::IST_Fp16m80 , X86::IST_F16m }, + { X86::IST_Fp32m32 , X86::IST_F32m }, + { X86::IST_Fp32m64 , X86::IST_F32m }, + { X86::IST_Fp32m80 , X86::IST_F32m }, + { X86::IST_Fp64m32 , X86::IST_FP64m }, + { X86::IST_Fp64m64 , X86::IST_FP64m }, + { X86::IST_Fp64m80 , X86::IST_FP64m }, + { X86::LD_Fp032 , X86::LD_F0 }, + { X86::LD_Fp064 , X86::LD_F0 }, + { X86::LD_Fp080 , X86::LD_F0 }, + { X86::LD_Fp132 , X86::LD_F1 }, + { X86::LD_Fp164 , X86::LD_F1 }, + { X86::LD_Fp180 , X86::LD_F1 }, + { X86::LD_Fp32m , X86::LD_F32m }, + { X86::LD_Fp32m64 , X86::LD_F32m }, + { X86::LD_Fp32m80 , X86::LD_F32m }, + { X86::LD_Fp64m , X86::LD_F64m }, + { X86::LD_Fp64m80 , X86::LD_F64m }, + { X86::LD_Fp80m , X86::LD_F80m }, + { X86::MUL_Fp32m , X86::MUL_F32m }, + { X86::MUL_Fp64m , X86::MUL_F64m }, + { X86::MUL_Fp64m32 , X86::MUL_F32m }, + { X86::MUL_Fp80m32 , X86::MUL_F32m }, + { X86::MUL_Fp80m64 , X86::MUL_F64m }, + { X86::MUL_FpI16m32 , X86::MUL_FI16m }, + { X86::MUL_FpI16m64 , X86::MUL_FI16m }, + { X86::MUL_FpI16m80 , X86::MUL_FI16m }, + { X86::MUL_FpI32m32 , X86::MUL_FI32m }, + { X86::MUL_FpI32m64 , X86::MUL_FI32m }, + { X86::MUL_FpI32m80 , X86::MUL_FI32m }, + { X86::SIN_Fp32 , X86::SIN_F }, + { X86::SIN_Fp64 , X86::SIN_F }, + { X86::SIN_Fp80 , X86::SIN_F }, + { X86::SQRT_Fp32 , X86::SQRT_F }, + { X86::SQRT_Fp64 , X86::SQRT_F }, + { X86::SQRT_Fp80 , X86::SQRT_F }, + { X86::ST_Fp32m , X86::ST_F32m }, + { X86::ST_Fp64m , X86::ST_F64m }, + { X86::ST_Fp64m32 , X86::ST_F32m }, + { X86::ST_Fp80m32 , X86::ST_F32m }, + { X86::ST_Fp80m64 , X86::ST_F64m }, + { X86::ST_FpP80m , X86::ST_FP80m }, + { X86::SUBR_Fp32m , X86::SUBR_F32m }, + { X86::SUBR_Fp64m , X86::SUBR_F64m }, + { X86::SUBR_Fp64m32 , X86::SUBR_F32m }, + { X86::SUBR_Fp80m32 , X86::SUBR_F32m }, + { X86::SUBR_Fp80m64 , X86::SUBR_F64m }, + { X86::SUBR_FpI16m32, X86::SUBR_FI16m}, + { X86::SUBR_FpI16m64, X86::SUBR_FI16m}, + { X86::SUBR_FpI16m80, 
X86::SUBR_FI16m}, + { X86::SUBR_FpI32m32, X86::SUBR_FI32m}, + { X86::SUBR_FpI32m64, X86::SUBR_FI32m}, + { X86::SUBR_FpI32m80, X86::SUBR_FI32m}, + { X86::SUB_Fp32m , X86::SUB_F32m }, + { X86::SUB_Fp64m , X86::SUB_F64m }, + { X86::SUB_Fp64m32 , X86::SUB_F32m }, + { X86::SUB_Fp80m32 , X86::SUB_F32m }, + { X86::SUB_Fp80m64 , X86::SUB_F64m }, + { X86::SUB_FpI16m32 , X86::SUB_FI16m }, + { X86::SUB_FpI16m64 , X86::SUB_FI16m }, + { X86::SUB_FpI16m80 , X86::SUB_FI16m }, + { X86::SUB_FpI32m32 , X86::SUB_FI32m }, + { X86::SUB_FpI32m64 , X86::SUB_FI32m }, + { X86::SUB_FpI32m80 , X86::SUB_FI32m }, + { X86::TST_Fp32 , X86::TST_F }, + { X86::TST_Fp64 , X86::TST_F }, + { X86::TST_Fp80 , X86::TST_F }, + { X86::UCOM_FpIr32 , X86::UCOM_FIr }, + { X86::UCOM_FpIr64 , X86::UCOM_FIr }, + { X86::UCOM_FpIr80 , X86::UCOM_FIr }, + { X86::UCOM_Fpr32 , X86::UCOM_Fr }, + { X86::UCOM_Fpr64 , X86::UCOM_Fr }, + { X86::UCOM_Fpr80 , X86::UCOM_Fr }, +}; + +static unsigned getConcreteOpcode(unsigned Opcode) { + ASSERT_SORTED(OpcodeTable); + int Opc = Lookup(OpcodeTable, Opcode); + assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!"); + return Opc; +} + +//===----------------------------------------------------------------------===// +// Helper Methods +//===----------------------------------------------------------------------===// + +// PopTable - Sorted map of instructions to their popping version. The first +// element is an instruction, the second is the version which pops. +// +static const TableEntry PopTable[] = { + { X86::ADD_FrST0 , X86::ADD_FPrST0 }, + + { X86::DIVR_FrST0, X86::DIVR_FPrST0 }, + { X86::DIV_FrST0 , X86::DIV_FPrST0 }, + + { X86::IST_F16m , X86::IST_FP16m }, + { X86::IST_F32m , X86::IST_FP32m }, + + { X86::MUL_FrST0 , X86::MUL_FPrST0 }, + + { X86::ST_F32m , X86::ST_FP32m }, + { X86::ST_F64m , X86::ST_FP64m }, + { X86::ST_Frr , X86::ST_FPrr }, + + { X86::SUBR_FrST0, X86::SUBR_FPrST0 }, + { X86::SUB_FrST0 , X86::SUB_FPrST0 }, + + { X86::UCOM_FIr , X86::UCOM_FIPr }, + + { X86::UCOM_FPr , X86::UCOM_FPPr }, + { X86::UCOM_Fr , X86::UCOM_FPr }, +}; + +/// popStackAfter - Pop the current value off of the top of the FP stack after +/// the specified instruction. This attempts to be sneaky and combine the pop +/// into the instruction itself if possible. The iterator is left pointing to +/// the last instruction, be it a new pop instruction inserted, or the old +/// instruction if it was modified in place. +/// +void FPS::popStackAfter(MachineBasicBlock::iterator &I) { + MachineInstr* MI = I; + DebugLoc dl = MI->getDebugLoc(); + ASSERT_SORTED(PopTable); + if (StackTop == 0) + report_fatal_error("Cannot pop empty stack!"); + RegMap[Stack[--StackTop]] = ~0; // Update state + + // Check to see if there is a popping version of this instruction... + int Opcode = Lookup(PopTable, I->getOpcode()); + if (Opcode != -1) { + I->setDesc(TII->get(Opcode)); + if (Opcode == X86::UCOM_FPPr) + I->RemoveOperand(0); + } else { // Insert an explicit pop + I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0); + } +} + +/// freeStackSlotAfter - Free the specified register from the register stack, so +/// that it is no longer in a register. If the register is currently at the top +/// of the stack, we just pop the current instruction, otherwise we store the +/// current top-of-stack into the specified slot, then pop the top of stack. +void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) { + if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy. 
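+    // Note that popStackAfter may fold the pop into the instruction itself
+    // via PopTable above, e.g. rewriting ST_F64m into its popping form
+    // ST_FP64m.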
+ popStackAfter(I); + return; + } + + // Otherwise, store the top of stack into the dead slot, killing the operand + // without having to add in an explicit xchg then pop. + // + I = freeStackSlotBefore(++I, FPRegNo); +} + +/// freeStackSlotBefore - Free the specified register without trying any +/// folding. +MachineBasicBlock::iterator +FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) { + unsigned STReg = getSTReg(FPRegNo); + unsigned OldSlot = getSlot(FPRegNo); + unsigned TopReg = Stack[StackTop-1]; + Stack[OldSlot] = TopReg; + RegMap[TopReg] = OldSlot; + RegMap[FPRegNo] = ~0; + Stack[--StackTop] = ~0; + return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)) + .addReg(STReg) + .getInstr(); +} + +/// adjustLiveRegs - Kill and revive registers such that exactly the FP +/// registers with a bit in Mask are live. +void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { + unsigned Defs = Mask; + unsigned Kills = 0; + for (unsigned i = 0; i < StackTop; ++i) { + unsigned RegNo = Stack[i]; + if (!(Defs & (1 << RegNo))) + // This register is live, but we don't want it. + Kills |= (1 << RegNo); + else + // We don't need to imp-def this live register. + Defs &= ~(1 << RegNo); + } + assert((Kills & Defs) == 0 && "Register needs killing and def'ing?"); + + // Produce implicit-defs for free by using killed registers. + while (Kills && Defs) { + unsigned KReg = countTrailingZeros(Kills); + unsigned DReg = countTrailingZeros(Defs); + DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n"); + std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]); + std::swap(RegMap[KReg], RegMap[DReg]); + Kills &= ~(1 << KReg); + Defs &= ~(1 << DReg); + } + + // Kill registers by popping. + if (Kills && I != MBB->begin()) { + MachineBasicBlock::iterator I2 = std::prev(I); + while (StackTop) { + unsigned KReg = getStackEntry(0); + if (!(Kills & (1 << KReg))) + break; + DEBUG(dbgs() << "Popping %FP" << KReg << "\n"); + popStackAfter(I2); + Kills &= ~(1 << KReg); + } + } + + // Manually kill the rest. + while (Kills) { + unsigned KReg = countTrailingZeros(Kills); + DEBUG(dbgs() << "Killing %FP" << KReg << "\n"); + freeStackSlotBefore(I, KReg); + Kills &= ~(1 << KReg); + } + + // Load zeros for all the imp-defs. + while(Defs) { + unsigned DReg = countTrailingZeros(Defs); + DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n"); + BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0)); + pushReg(DReg); + Defs &= ~(1 << DReg); + } + + // Now we should have the correct registers live. + DEBUG(dumpStack()); + assert(StackTop == countPopulation(Mask) && "Live count mismatch"); +} + +/// shuffleStackTop - emit fxch instructions before I to shuffle the top +/// FixCount entries into the order given by FixStack. +/// FIXME: Is there a better algorithm than insertion sort? +void FPS::shuffleStackTop(const unsigned char *FixStack, + unsigned FixCount, + MachineBasicBlock::iterator I) { + // Move items into place, starting from the desired stack bottom. + while (FixCount--) { + // Old register at position FixCount. + unsigned OldReg = getStackEntry(FixCount); + // Desired register at position FixCount. 
+    unsigned Reg = FixStack[FixCount];
+    if (Reg == OldReg)
+      continue;
+    // (Reg st0) (OldReg st0) = (Reg OldReg st0)
+    moveToTop(Reg, I);
+    if (FixCount > 0)
+      moveToTop(OldReg, I);
+  }
+  DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+  unsigned STReturns = 0;
+
+  for (const auto &MO : I->operands()) {
+    if (!MO.isReg())
+      continue;
+
+    unsigned R = MO.getReg() - X86::FP0;
+
+    if (R < 8) {
+      assert(MO.isDef() && MO.isImplicit());
+      STReturns |= 1 << R;
+    }
+  }
+
+  unsigned N = countTrailingOnes(STReturns);
+
+  // FP registers used for function return must be consecutive starting at
+  // FP0.
+  assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2));
+
+  for (unsigned I = 0; I < N; ++I)
+    pushReg(N - I - 1);
+}
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned DestReg = getFPReg(MI->getOperand(0));
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);   // Remove the explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // Result gets pushed on the stack.
+  pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+  bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it. This ensures that popping the value is
+  // always ok.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+  //
+  if (!KillsSrc &&
+      (MI->getOpcode() == X86::IST_Fp64m32 ||
+       MI->getOpcode() == X86::ISTT_Fp16m32 ||
+       MI->getOpcode() == X86::ISTT_Fp32m32 ||
+       MI->getOpcode() == X86::ISTT_Fp64m32 ||
+       MI->getOpcode() == X86::IST_Fp64m64 ||
+       MI->getOpcode() == X86::ISTT_Fp16m64 ||
+       MI->getOpcode() == X86::ISTT_Fp32m64 ||
+       MI->getOpcode() == X86::ISTT_Fp64m64 ||
+       MI->getOpcode() == X86::IST_Fp64m80 ||
+       MI->getOpcode() == X86::ISTT_Fp16m80 ||
+       MI->getOpcode() == X86::ISTT_Fp32m80 ||
+       MI->getOpcode() == X86::ISTT_Fp64m80 ||
+       MI->getOpcode() == X86::ST_FpP80m)) {
+    duplicateToTop(Reg, ScratchFPReg, I);
+  } else {
+    moveToTop(Reg, I);            // Move to the top of the stack...
+  }
+
+  // Convert from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(NumOps-1);    // Remove explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  if (MI->getOpcode() == X86::IST_FP64m ||
+      MI->getOpcode() == X86::ISTT_FP16m ||
+      MI->getOpcode() == X86::ISTT_FP32m ||
+      MI->getOpcode() == X86::ISTT_FP64m ||
+      MI->getOpcode() == X86::ST_FP80m) {
+    if (StackTop == 0)
+      report_fatal_error("Stack empty??");
+    --StackTop;
+  } else if (KillsSrc) { // Last use of operand?
+    popStackAfter(I);
+  }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value.
These instructions may have +/// non-fp operands after their FP operands. +/// +/// Examples: +/// R1 = fchs R2 +/// R1 = fadd R2, [mem] +/// +void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; +#ifndef NDEBUG + unsigned NumOps = MI->getDesc().getNumOperands(); + assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!"); +#endif + + // Is this the last use of the source register? + unsigned Reg = getFPReg(MI->getOperand(1)); + bool KillsSrc = MI->killsRegister(X86::FP0+Reg); + + if (KillsSrc) { + // If this is the last use of the source register, just make sure it's on + // the top of the stack. + moveToTop(Reg, I); + if (StackTop == 0) + report_fatal_error("Stack cannot be empty!"); + --StackTop; + pushReg(getFPReg(MI->getOperand(0))); + } else { + // If this is not the last use of the source register, _copy_ it to the top + // of the stack. + duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I); + } + + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(1); // Drop the source operand. + MI->RemoveOperand(0); // Drop the destination operand. + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); +} + + +//===----------------------------------------------------------------------===// +// Define tables of various ways to map pseudo instructions +// + +// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i) +static const TableEntry ForwardST0Table[] = { + { X86::ADD_Fp32 , X86::ADD_FST0r }, + { X86::ADD_Fp64 , X86::ADD_FST0r }, + { X86::ADD_Fp80 , X86::ADD_FST0r }, + { X86::DIV_Fp32 , X86::DIV_FST0r }, + { X86::DIV_Fp64 , X86::DIV_FST0r }, + { X86::DIV_Fp80 , X86::DIV_FST0r }, + { X86::MUL_Fp32 , X86::MUL_FST0r }, + { X86::MUL_Fp64 , X86::MUL_FST0r }, + { X86::MUL_Fp80 , X86::MUL_FST0r }, + { X86::SUB_Fp32 , X86::SUB_FST0r }, + { X86::SUB_Fp64 , X86::SUB_FST0r }, + { X86::SUB_Fp80 , X86::SUB_FST0r }, +}; + +// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0) +static const TableEntry ReverseST0Table[] = { + { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative + { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative + { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative + { X86::DIV_Fp32 , X86::DIVR_FST0r }, + { X86::DIV_Fp64 , X86::DIVR_FST0r }, + { X86::DIV_Fp80 , X86::DIVR_FST0r }, + { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative + { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative + { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative + { X86::SUB_Fp32 , X86::SUBR_FST0r }, + { X86::SUB_Fp64 , X86::SUBR_FST0r }, + { X86::SUB_Fp80 , X86::SUBR_FST0r }, +}; + +// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i) +static const TableEntry ForwardSTiTable[] = { + { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative + { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative + { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative + { X86::DIV_Fp32 , X86::DIVR_FrST0 }, + { X86::DIV_Fp64 , X86::DIVR_FrST0 }, + { X86::DIV_Fp80 , X86::DIVR_FrST0 }, + { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative + { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative + { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative + { X86::SUB_Fp32 , X86::SUBR_FrST0 }, + { X86::SUB_Fp64 , X86::SUBR_FrST0 }, + { X86::SUB_Fp80 , X86::SUBR_FrST0 }, +}; + +// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0) +static const TableEntry ReverseSTiTable[] = { + { X86::ADD_Fp32 , X86::ADD_FrST0 }, + { X86::ADD_Fp64 , X86::ADD_FrST0 }, + { X86::ADD_Fp80 , X86::ADD_FrST0 }, + { X86::DIV_Fp32 , X86::DIV_FrST0 }, + { X86::DIV_Fp64 , 
X86::DIV_FrST0 }, + { X86::DIV_Fp80 , X86::DIV_FrST0 }, + { X86::MUL_Fp32 , X86::MUL_FrST0 }, + { X86::MUL_Fp64 , X86::MUL_FrST0 }, + { X86::MUL_Fp80 , X86::MUL_FrST0 }, + { X86::SUB_Fp32 , X86::SUB_FrST0 }, + { X86::SUB_Fp64 , X86::SUB_FrST0 }, + { X86::SUB_Fp80 , X86::SUB_FrST0 }, +}; + + +/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual +/// instructions which need to be simplified and possibly transformed. +/// +/// Result: ST(0) = fsub ST(0), ST(i) +/// ST(i) = fsub ST(0), ST(i) +/// ST(0) = fsubr ST(0), ST(i) +/// ST(i) = fsubr ST(0), ST(i) +/// +void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { + ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); + ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); + MachineInstr *MI = I; + + unsigned NumOperands = MI->getDesc().getNumOperands(); + assert(NumOperands == 3 && "Illegal TwoArgFP instruction!"); + unsigned Dest = getFPReg(MI->getOperand(0)); + unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2)); + unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1)); + bool KillsOp0 = MI->killsRegister(X86::FP0+Op0); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + DebugLoc dl = MI->getDebugLoc(); + + unsigned TOS = getStackEntry(0); + + // One of our operands must be on the top of the stack. If neither is yet, we + // need to move one. + if (Op0 != TOS && Op1 != TOS) { // No operand at TOS? + // We can choose to move either operand to the top of the stack. If one of + // the operands is killed by this instruction, we want that one so that we + // can update right on top of the old version. + if (KillsOp0) { + moveToTop(Op0, I); // Move dead operand to TOS. + TOS = Op0; + } else if (KillsOp1) { + moveToTop(Op1, I); + TOS = Op1; + } else { + // All of the operands are live after this instruction executes, so we + // cannot update on top of any operand. Because of this, we must + // duplicate one of the stack elements to the top. It doesn't matter + // which one we pick. + // + duplicateToTop(Op0, Dest, I); + Op0 = TOS = Dest; + KillsOp0 = true; + } + } else if (!KillsOp0 && !KillsOp1) { + // If we DO have one of our operands at the top of the stack, but we don't + // have a dead operand, we must duplicate one of the operands to a new slot + // on the stack. + duplicateToTop(Op0, Dest, I); + Op0 = TOS = Dest; + KillsOp0 = true; + } + + // Now we know that one of our operands is on the top of the stack, and at + // least one of our operands is killed by this instruction. + assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) && + "Stack conditions not set up right!"); + + // We decide which form to use based on what is on the top of the stack, and + // which operand is killed by this instruction. + ArrayRef<TableEntry> InstTable; + bool isForward = TOS == Op0; + bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0); + if (updateST0) { + if (isForward) + InstTable = ForwardST0Table; + else + InstTable = ReverseST0Table; + } else { + if (isForward) + InstTable = ForwardSTiTable; + else + InstTable = ReverseSTiTable; + } + + int Opcode = Lookup(InstTable, MI->getOpcode()); + assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!"); + + // NotTOS - The register which is not on the top of stack... + unsigned NotTOS = (TOS == Op0) ? 
Op1 : Op0; + + // Replace the old instruction with a new instruction + MBB->remove(I++); + I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS)); + + // If both operands are killed, pop one off of the stack in addition to + // overwriting the other one. + if (KillsOp0 && KillsOp1 && Op0 != Op1) { + assert(!updateST0 && "Should have updated other operand!"); + popStackAfter(I); // Pop the top of stack + } + + // Update stack information so that we know the destination register is now on + // the stack. + unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS); + assert(UpdatedSlot < StackTop && Dest < 7); + Stack[UpdatedSlot] = Dest; + RegMap[Dest] = UpdatedSlot; + MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction +} + +/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP +/// register arguments and no explicit destinations. +/// +void FPS::handleCompareFP(MachineBasicBlock::iterator &I) { + ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); + ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); + MachineInstr *MI = I; + + unsigned NumOperands = MI->getDesc().getNumOperands(); + assert(NumOperands == 2 && "Illegal FUCOM* instruction!"); + unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2)); + unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1)); + bool KillsOp0 = MI->killsRegister(X86::FP0+Op0); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + + // Make sure the first operand is on the top of stack, the other one can be + // anywhere. + moveToTop(Op0, I); + + // Change from the pseudo instruction to the concrete instruction. + MI->getOperand(0).setReg(getSTReg(Op1)); + MI->RemoveOperand(1); + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // If any of the operands are killed by this instruction, free them. + if (KillsOp0) freeStackSlotAfter(I, Op0); + if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1); +} + +/// handleCondMovFP - Handle two address conditional move instructions. These +/// instructions move a st(i) register to st(0) iff a condition is true. These +/// instructions require that the first operand is at the top of the stack, but +/// otherwise don't modify the stack at all. +void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; + + unsigned Op0 = getFPReg(MI->getOperand(0)); + unsigned Op1 = getFPReg(MI->getOperand(2)); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + + // The first operand *must* be on the top of the stack. + moveToTop(Op0, I); + + // Change the second operand to the stack register that the operand is in. + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(0); + MI->RemoveOperand(1); + MI->getOperand(0).setReg(getSTReg(Op1)); + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // If we kill the second operand, make sure to pop it from the stack. + if (Op0 != Op1 && KillsOp1) { + // Get this value off of the register stack. + freeStackSlotAfter(I, Op1); + } +} + + +/// handleSpecialFP - Handle special instructions which behave unlike other +/// floating point instructions. This is primarily intended for use by pseudo +/// instructions. 
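+/// The cases handled below are calls, COPY, IMPLICIT_DEF, INLINEASM and the
+/// RET/RETI pseudo forms.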
+/// +void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { + MachineInstr *MI = Inst; + + if (MI->isCall()) { + handleCall(Inst); + return; + } + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown SpecialFP instruction!"); + case TargetOpcode::COPY: { + // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP. + const MachineOperand &MO1 = MI->getOperand(1); + const MachineOperand &MO0 = MI->getOperand(0); + bool KillsSrc = MI->killsRegister(MO1.getReg()); + + // FP <- FP copy. + unsigned DstFP = getFPReg(MO0); + unsigned SrcFP = getFPReg(MO1); + assert(isLive(SrcFP) && "Cannot copy dead register"); + if (KillsSrc) { + // If the input operand is killed, we can just change the owner of the + // incoming stack slot into the result. + unsigned Slot = getSlot(SrcFP); + Stack[Slot] = DstFP; + RegMap[DstFP] = Slot; + } else { + // For COPY we just duplicate the specified value to a new stack slot. + // This could be made better, but would require substantial changes. + duplicateToTop(SrcFP, DstFP, Inst); + } + break; + } + + case TargetOpcode::IMPLICIT_DEF: { + // All FP registers must be explicitly defined, so load a 0 instead. + unsigned Reg = MI->getOperand(0).getReg() - X86::FP0; + DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n'); + BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::LD_F0)); + pushReg(Reg); + break; + } + + case TargetOpcode::INLINEASM: { + // The inline asm MachineInstr currently only *uses* FP registers for the + // 'f' constraint. These should be turned into the current ST(x) register + // in the machine instr. + // + // There are special rules for x87 inline assembly. The compiler must know + // exactly how many registers are popped and pushed implicitly by the asm. + // Otherwise it is not possible to restore the stack state after the inline + // asm. + // + // There are 3 kinds of input operands: + // + // 1. Popped inputs. These must appear at the stack top in ST0-STn. A + // popped input operand must be in a fixed stack slot, and it is either + // tied to an output operand, or in the clobber list. The MI has ST use + // and def operands for these inputs. + // + // 2. Fixed inputs. These inputs appear in fixed stack slots, but are + // preserved by the inline asm. The fixed stack slots must be STn-STm + // following the popped inputs. A fixed input operand cannot be tied to + // an output or appear in the clobber list. The MI has ST use operands + // and no defs for these inputs. + // + // 3. Preserved inputs. These inputs use the "f" constraint which is + // represented as an FP register. The inline asm won't change these + // stack slots. + // + // Outputs must be in ST registers, FP outputs are not allowed. Clobbered + // registers do not count as output operands. The inline asm changes the + // stack as if it popped all the popped inputs and then pushed all the + // output operands. + + // Scan the assembly for ST registers used, defined and clobbered. We can + // only tell clobbers from defs by looking at the asm descriptor. 
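+    // A minimal sketch of case 1 (hypothetical user code, not from this
+    // pass):
+    //   double r, x = 2.0;
+    //   asm ("fsqrt" : "=t" (r) : "0" (x));
+    // The "0"-tied input is a popped input living in ST(0), and the "=t"
+    // output is pushed back into ST(0), so the net stack depth is unchanged.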
+    unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+    unsigned NumOps = 0;
+    SmallSet<unsigned, 1> FRegIdx;
+    unsigned RCID;
+
+    for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
+         i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
+      unsigned Flags = MI->getOperand(i).getImm();
+
+      NumOps = InlineAsm::getNumOperandRegisters(Flags);
+      if (NumOps != 1)
+        continue;
+      const MachineOperand &MO = MI->getOperand(i + 1);
+      if (!MO.isReg())
+        continue;
+      unsigned STReg = MO.getReg() - X86::FP0;
+      if (STReg >= 8)
+        continue;
+
+      // If the flag has a register class constraint, this must be an operand
+      // with constraint "f". Record its index and continue.
+      if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+        FRegIdx.insert(i + 1);
+        continue;
+      }
+
+      switch (InlineAsm::getKind(Flags)) {
+      case InlineAsm::Kind_RegUse:
+        STUses |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_RegDef:
+      case InlineAsm::Kind_RegDefEarlyClobber:
+        STDefs |= (1u << STReg);
+        if (MO.isDead())
+          STDeadDefs |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_Clobber:
+        STClobbers |= (1u << STReg);
+        break;
+      default:
+        break;
+      }
+    }
+
+    if (STUses && !isMask_32(STUses))
+      MI->emitError("fixed input regs must be last on the x87 stack");
+    unsigned NumSTUses = countTrailingOnes(STUses);
+
+    // Defs must be contiguous from the stack top. ST0-STn.
+    if (STDefs && !isMask_32(STDefs)) {
+      MI->emitError("output regs must be last on the x87 stack");
+      STDefs = NextPowerOf2(STDefs) - 1;
+    }
+    unsigned NumSTDefs = countTrailingOnes(STDefs);
+
+    // So must the clobbered stack slots. ST0-STm, m >= n.
+    if (STClobbers && !isMask_32(STDefs | STClobbers))
+      MI->emitError("clobbers must be last on the x87 stack");
+
+    // Popped inputs are the ones that are also clobbered or defined.
+    unsigned STPopped = STUses & (STDefs | STClobbers);
+    if (STPopped && !isMask_32(STPopped))
+      MI->emitError("implicitly popped regs must be last on the x87 stack");
+    unsigned NumSTPopped = countTrailingOnes(STPopped);
+
+    DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+                 << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+
+#ifndef NDEBUG
+    // If any input operand uses constraint "f", all output register
+    // constraints must be early-clobber defs.
+    for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I)
+      if (FRegIdx.count(I)) {
+        assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 &&
+               "Operands with constraint \"f\" cannot overlap with defs");
+      }
+#endif
+
+    // Collect all FP registers (register operands with constraints "t", "u",
+    // and "f") to kill after the instruction.
+    unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      unsigned FPReg = getFPReg(Op);
+
+      // If we kill this operand, make sure to pop it from the stack after the
+      // asm.  We just remember it for now, and pop them all off at the end in
+      // a batch.
+      if (Op.isUse() && Op.isKill())
+        FPKills |= 1U << FPReg;
+    }
+
+    // Do not include registers that are implicitly popped by defs/clobbers.
+    FPKills &= ~(STDefs | STClobbers);
+
+    // Now we can rearrange the live registers to match what was requested.
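+    // The asm expects FP register i of its fixed and popped inputs to sit in
+    // ST(i), so request the identity layout FP0..FP(NumSTUses-1) at the top
+    // of the stack.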
+ unsigned char STUsesArray[8]; + + for (unsigned I = 0; I < NumSTUses; ++I) + STUsesArray[I] = I; + + shuffleStackTop(STUsesArray, NumSTUses, Inst); + DEBUG({dbgs() << "Before asm: "; dumpStack();}); + + // With the stack layout fixed, rewrite the FP registers. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) + continue; + + unsigned FPReg = getFPReg(Op); + + if (FRegIdx.count(i)) + // Operand with constraint "f". + Op.setReg(getSTReg(FPReg)); + else + // Operand with a single register class constraint ("t" or "u"). + Op.setReg(X86::ST0 + FPReg); + } + + // Simulate the inline asm popping its inputs and pushing its outputs. + StackTop -= NumSTPopped; + + for (unsigned i = 0; i < NumSTDefs; ++i) + pushReg(NumSTDefs - i - 1); + + // If this asm kills any FP registers (is the last use of them) we must + // explicitly emit pop instructions for them. Do this now after the asm has + // executed so that the ST(x) numbers are not off (which would happen if we + // did this inline with operand rewriting). + // + // Note: this might be a non-optimal pop sequence. We might be able to do + // better by trying to pop in stack order or something. + while (FPKills) { + unsigned FPReg = countTrailingZeros(FPKills); + if (isLive(FPReg)) + freeStackSlotAfter(Inst, FPReg); + FPKills &= ~(1U << FPReg); + } + + // Don't delete the inline asm! + return; + } + + case X86::RETQ: + case X86::RETL: + case X86::RETIL: + case X86::RETIQ: + // If RET has an FP register use operand, pass the first one in ST(0) and + // the second one in ST(1). + + // Find the register operands. + unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U; + unsigned LiveMask = 0; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) + continue; + // FP Register uses must be kills unless there are two uses of the same + // register, in which case only one will be a kill. + assert(Op.isUse() && + (Op.isKill() || // Marked kill. + getFPReg(Op) == FirstFPRegOp || // Second instance. + MI->killsRegister(Op.getReg())) && // Later use is marked kill. + "Ret only defs operands, and values aren't live beyond it"); + + if (FirstFPRegOp == ~0U) + FirstFPRegOp = getFPReg(Op); + else { + assert(SecondFPRegOp == ~0U && "More than two fp operands!"); + SecondFPRegOp = getFPReg(Op); + } + LiveMask |= (1 << getFPReg(Op)); + + // Remove the operand so that later passes don't see it. + MI->RemoveOperand(i); + --i, --e; + } + + // We may have been carrying spurious live-ins, so make sure only the returned + // registers are left live. + adjustLiveRegs(LiveMask, MI); + if (!LiveMask) return; // Quick check to see if any are possible. + + // There are only four possibilities here: + // 1) we are returning a single FP value. In this case, it has to be in + // ST(0) already, so just declare success by removing the value from the + // FP Stack. + if (SecondFPRegOp == ~0U) { + // Assert that the top of stack contains the right FP register. + assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) && + "Top of stack not the right register for RET!"); + + // Ok, everything is good, mark the value as not being on the stack + // anymore so that our assertion about the stack being empty at end of + // block doesn't fire. 
+ StackTop = 0; + return; + } + + // Otherwise, we are returning two values: + // 2) If returning the same value for both, we only have one thing in the FP + // stack. Consider: RET FP1, FP1 + if (StackTop == 1) { + assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&& + "Stack misconfiguration for RET!"); + + // Duplicate the TOS so that we return it twice. Just pick some other FPx + // register to hold it. + unsigned NewReg = ScratchFPReg; + duplicateToTop(FirstFPRegOp, NewReg, MI); + FirstFPRegOp = NewReg; + } + + /// Okay we know we have two different FPx operands now: + assert(StackTop == 2 && "Must have two values live!"); + + /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently + /// in ST(1). In this case, emit an fxch. + if (getStackEntry(0) == SecondFPRegOp) { + assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live"); + moveToTop(FirstFPRegOp, MI); + } + + /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in + /// ST(1). Just remove both from our understanding of the stack and return. + assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live"); + assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live"); + StackTop = 0; + return; + } + + Inst = MBB->erase(Inst); // Remove the pseudo instruction + + // We want to leave I pointing to the previous instruction, but what if we + // just erased the first instruction? + if (Inst == MBB->begin()) { + DEBUG(dbgs() << "Inserting dummy KILL\n"); + Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL)); + } else + --Inst; +} + +void FPS::setKillFlags(MachineBasicBlock &MBB) const { + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + LivePhysRegs LPR(TRI); + + LPR.addLiveOuts(&MBB); + + for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); + I != E; ++I) { + if (I->isDebugValue()) + continue; + + std::bitset<8> Defs; + SmallVector<MachineOperand *, 2> Uses; + MachineInstr &MI = *I; + + for (auto &MO : I->operands()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg() - X86::FP0; + + if (Reg >= 8) + continue; + + if (MO.isDef()) { + Defs.set(Reg); + if (!LPR.contains(MO.getReg())) + MO.setIsDead(); + } else + Uses.push_back(&MO); + } + + for (auto *MO : Uses) + if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg())) + MO->setIsKill(); + + LPR.stepBackward(MI); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp new file mode 100644 index 0000000..8b5fd27 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -0,0 +1,2698 @@ +//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of TargetFrameLowering class. 
+// +//===----------------------------------------------------------------------===// + +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" +#include <cstdlib> + +using namespace llvm; + +X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, + unsigned StackAlignOverride) + : TargetFrameLowering(StackGrowsDown, StackAlignOverride, + STI.is64Bit() ? -8 : -4), + STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { + // Cache a bunch of frame-related predicates for this subtarget. + SlotSize = TRI->getSlotSize(); + Is64Bit = STI.is64Bit(); + IsLP64 = STI.isTarget64BitLP64(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + StackPtr = TRI->getStackRegister(); +} + +bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + return !MF.getFrameInfo()->hasVarSizedObjects() && + !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); +} + +/// canSimplifyCallFramePseudos - If there is a reserved call frame, the +/// call frame pseudos can be simplified. Having a FP, as in the default +/// implementation, is not sufficient here since we can't always use it. +/// Use a more nuanced condition. +bool +X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { + return hasReservedCallFrame(MF) || + (hasFP(MF) && !TRI->needsStackRealignment(MF)) || + TRI->hasBasePointer(MF); +} + +// needsFrameIndexResolution - Do we need to perform FI resolution for +// this function. Normally, this is required only when the function +// has any stack objects. However, FI resolution actually has another job, +// not apparent from the title - it resolves callframesetup/destroy +// that were not simplified earlier. +// So, this is required for x86 functions that have push sequences even +// when there are no stack objects. +bool +X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { + return MF.getFrameInfo()->hasStackObjects() || + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. 
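+/// In practice the implementation also forces a frame pointer for stack
+/// realignment, a taken frame address, opaque SP adjustments, EH constructs
+/// and stackmap/patchpoint records (see the checks below).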
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineModuleInfo &MMI = MF.getMMI(); + + return (MF.getTarget().Options.DisableFramePointerElim(MF) || + TRI->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || + MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || + MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() || + MFI->hasStackMap() || MFI->hasPatchPoint()); +} + +static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::SUB64ri8; + return X86::SUB64ri32; + } else { + if (isInt<8>(Imm)) + return X86::SUB32ri8; + return X86::SUB32ri; + } +} + +static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::ADD64ri8; + return X86::ADD64ri32; + } else { + if (isInt<8>(Imm)) + return X86::ADD32ri8; + return X86::ADD32ri; + } +} + +static unsigned getSUBrrOpcode(unsigned isLP64) { + return isLP64 ? X86::SUB64rr : X86::SUB32rr; +} + +static unsigned getADDrrOpcode(unsigned isLP64) { + return isLP64 ? X86::ADD64rr : X86::ADD32rr; +} + +static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::AND64ri8; + return X86::AND64ri32; + } + if (isInt<8>(Imm)) + return X86::AND32ri8; + return X86::AND32ri; +} + +static unsigned getLEArOpcode(unsigned IsLP64) { + return IsLP64 ? X86::LEA64r : X86::LEA32r; +} + +/// findDeadCallerSavedReg - Return a caller-saved register that isn't live +/// when it reaches the "return" instruction. We can then pop a stack object +/// to this register without worry about clobbering it. +static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const X86RegisterInfo *TRI, + bool Is64Bit) { + const MachineFunction *MF = MBB.getParent(); + const Function *F = MF->getFunction(); + if (!F || MF->getMMI().callsEHReturn()) + return 0; + + const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF); + + unsigned Opc = MBBI->getOpcode(); + switch (Opc) { + default: return 0; + case X86::RETL: + case X86::RETQ: + case X86::RETIL: + case X86::RETIQ: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: { + SmallSet<uint16_t, 8> Uses; + for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MBBI->getOperand(i); + if (!MO.isReg() || MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); + } + + for (auto CS : AvailableRegs) + if (!Uses.count(CS) && CS != X86::RIP) + return CS; + } + } + + return 0; +} + +static bool isEAXLiveIn(MachineFunction &MF) { + for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), + EE = MF.getRegInfo().livein_end(); II != EE; ++II) { + unsigned Reg = II->first; + + if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || + Reg == X86::AH || Reg == X86::AL) + return true; + } + + return false; +} + +/// Check if the flags need to be preserved before the terminators. +/// This would be the case, if the eflags is live-in of the region +/// composed by the terminators or live-out of that region, without +/// being defined by a terminator. 
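+/// A typical example is a conditional branch terminator whose EFLAGS input
+/// was produced by a compare emitted before the terminator sequence; an
+/// ADD/SUB inserted there to adjust SP would clobber that value.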
+static bool
+flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
+  for (const MachineInstr &MI : MBB.terminators()) {
+    bool BreakNext = false;
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg != X86::EFLAGS)
+        continue;
+
+      // This terminator needs an eflags that is not defined
+      // by a previous terminator:
+      // EFLAGS is live-in of the region composed by the terminators.
+      if (!MO.isDef())
+        return true;
+      // This terminator defines the eflags, i.e., we don't need to preserve it.
+      // However, we still need to check that this specific terminator does not
+      // read a live-in value.
+      BreakNext = true;
+    }
+    // We found a definition of the eflags, no need to preserve them.
+    if (BreakNext)
+      return false;
+  }
+
+  // None of the terminators use or define the eflags.
+  // Check if they are live-out; that would imply we need to preserve them.
+  for (const MachineBasicBlock *Succ : MBB.successors())
+    if (Succ->isLiveIn(X86::EFLAGS))
+      return true;
+
+  return false;
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    int64_t NumBytes, bool InEpilogue) const {
+  bool isSub = NumBytes < 0;
+  uint64_t Offset = isSub ? -NumBytes : NumBytes;
+
+  uint64_t Chunk = (1LL << 31) - 1;
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  while (Offset) {
+    if (Offset > Chunk) {
+      // Rather than emit a long series of instructions for large offsets,
+      // load the offset into a register and do one sub/add
+      unsigned Reg = 0;
+
+      if (isSub && !isEAXLiveIn(*MBB.getParent()))
+        Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+      else
+        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+
+      if (Reg) {
+        unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
+          .addImm(Offset);
+        Opc = isSub
+          ? getSUBrrOpcode(Is64Bit)
+          : getADDrrOpcode(Is64Bit);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+                               .addReg(StackPtr)
+                               .addReg(Reg);
+        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+        Offset = 0;
+        continue;
+      }
+    }
+
+    uint64_t ThisVal = std::min(Offset, Chunk);
+    if (ThisVal == (Is64Bit ? 8 : 4)) {
+      // Use push / pop instead.
+      unsigned Reg = isSub
+        ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+      if (Reg) {
+        unsigned Opc = isSub
+          ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+          : (Is64Bit ? X86::POP64r : X86::POP32r);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
+          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
+        if (isSub)
+          MI->setFlag(MachineInstr::FrameSetup);
+        else
+          MI->setFlag(MachineInstr::FrameDestroy);
+        Offset -= ThisVal;
+        continue;
+      }
+    }
+
+    MachineInstrBuilder MI = BuildStackAdjustment(
+        MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
+    if (isSub)
+      MI.setMIFlag(MachineInstr::FrameSetup);
+    else
+      MI.setMIFlag(MachineInstr::FrameDestroy);
+
+    Offset -= ThisVal;
+  }
+}
+
+MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL,
+    int64_t Offset, bool InEpilogue) const {
+  assert(Offset != 0 && "zero offset stack adjustment requested");
+
+  // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
+  // is tricky.
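+  // The trade-off: "lea -N(%rsp), %rsp" leaves EFLAGS untouched, whereas
+  // "sub $N, %rsp" / "add $N, %rsp" redefine EFLAGS and are unsafe whenever
+  // live flags cross the insertion point.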
+ bool UseLEA; + if (!InEpilogue) { + // Check if inserting the prologue at the beginning + // of MBB would require to use LEA operations. + // We need to use LEA operations if EFLAGS is live in, because + // it means an instruction will read it before it gets defined. + UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); + } else { + // If we can use LEA for SP but we shouldn't, check that none + // of the terminators uses the eflags. Otherwise we will insert + // a ADD that will redefine the eflags and break the condition. + // Alternatively, we could move the ADD, but this may not be possible + // and is an optimization anyway. + UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); + if (UseLEA && !STI.useLeaForSP()) + UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); + // If that assert breaks, that means we do not do the right thing + // in canUseAsEpilogue. + assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && + "We shouldn't have allowed this insertion point"); + } + + MachineInstrBuilder MI; + if (UseLEA) { + MI = addRegOffset(BuildMI(MBB, MBBI, DL, + TII.get(getLEArOpcode(Uses64BitFramePtr)), + StackPtr), + StackPtr, false, Offset); + } else { + bool IsSub = Offset < 0; + uint64_t AbsOffset = IsSub ? -Offset : Offset; + unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) + : getADDriOpcode(Uses64BitFramePtr, AbsOffset); + MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(AbsOffset); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + } + return MI; +} + +int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + bool doMergeWithPrevious) const { + if ((doMergeWithPrevious && MBBI == MBB.begin()) || + (!doMergeWithPrevious && MBBI == MBB.end())) + return 0; + + MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; + MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr + : std::next(MBBI); + unsigned Opc = PI->getOpcode(); + int Offset = 0; + + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || + Opc == X86::LEA32r || Opc == X86::LEA64_32r) && + PI->getOperand(0).getReg() == StackPtr){ + Offset += PI->getOperand(2).getImm(); + MBB.erase(PI); + if (!doMergeWithPrevious) MBBI = NI; + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + PI->getOperand(0).getReg() == StackPtr) { + Offset -= PI->getOperand(2).getImm(); + MBB.erase(PI); + if (!doMergeWithPrevious) MBBI = NI; + } + + return Offset; +} + +void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + MCCFIInstruction CFIInst) const { + MachineFunction &MF = *MBB.getParent(); + unsigned CFIIndex = MF.getMMI().addFrameInst(CFIInst); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); +} + +void +X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + if (CSI.empty()) return; + + // Calculate offsets. 
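+  // Each callee-saved register gets a .cfi_offset describing where it was
+  // spilled relative to the CFA.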
+ for (std::vector<CalleeSavedInfo>::const_iterator + I = CSI.begin(), E = CSI.end(); I != E; ++I) { + int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); + unsigned Reg = I->getReg(); + + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + } +} + +MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + if (STI.isTargetWindowsCoreCLR()) { + if (InProlog) { + return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + } else { + return emitStackProbeInline(MF, MBB, MBBI, DL, false); + } + } else { + return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); + } +} + +void X86FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const { + const StringRef ChkStkStubSymbol = "__chkstk_stub"; + MachineInstr *ChkStkStub = nullptr; + + for (MachineInstr &MI : PrologMBB) { + if (MI.isCall() && MI.getOperand(0).isSymbol() && + ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) { + ChkStkStub = &MI; + break; + } + } + + if (ChkStkStub != nullptr) { + MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator()); + assert(std::prev(MBBI).operator==(ChkStkStub) && + "MBBI expected after __chkstk_stub."); + DebugLoc DL = PrologMBB.findDebugLoc(MBBI); + emitStackProbeInline(MF, PrologMBB, MBBI, DL, true); + ChkStkStub->eraseFromParent(); + } +} + +MachineInstr *X86FrameLowering::emitStackProbeInline( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + assert(STI.is64Bit() && "different expansion needed for 32 bit"); + assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + // RAX contains the number of bytes of desired stack adjustment. + // The handling here assumes this value has already been updated so as to + // maintain stack alignment. + // + // We need to exit with RSP modified by this amount and execute suitable + // page touches to notify the OS that we're growing the stack responsibly. + // All stack probing must be done without modifying RSP. + // + // MBB: + // SizeReg = RAX; + // ZeroReg = 0 + // CopyReg = RSP + // Flags, TestReg = CopyReg - SizeReg + // FinalReg = !Flags.Ovf ? TestReg : ZeroReg + // LimitReg = gs magic thread env access + // if FinalReg >= LimitReg goto ContinueMBB + // RoundBB: + // RoundReg = page address of FinalReg + // LoopMBB: + // LoopReg = PHI(LimitReg,ProbeReg) + // ProbeReg = LoopReg - PageSize + // [ProbeReg] = 0 + // if (ProbeReg > RoundReg) goto LoopMBB + // ContinueMBB: + // RSP = RSP - RAX + // [rest of original MBB] + + // Set up the new basic blocks + MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = std::next(MBB.getIterator()); + MF.insert(MBBIter, RoundMBB); + MF.insert(MBBIter, LoopMBB); + MF.insert(MBBIter, ContinueMBB); + + // Split MBB and move the tail portion down to ContinueMBB. 
+  MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+  ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+  ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  // Some useful constants
+  const int64_t ThreadEnvironmentStackLimit = 0x10;
+  const int64_t PageSize = 0x1000;
+  const int64_t PageMask = ~(PageSize - 1);
+
+  // Registers we need. For the normal case we use virtual
+  // registers. For the prolog expansion we use RAX, RCX and RDX.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+  const unsigned SizeReg = InProlog ? (unsigned)X86::RAX
+                                    : MRI.createVirtualRegister(RegClass),
+                 ZeroReg = InProlog ? (unsigned)X86::RCX
+                                    : MRI.createVirtualRegister(RegClass),
+                 CopyReg = InProlog ? (unsigned)X86::RDX
+                                    : MRI.createVirtualRegister(RegClass),
+                 TestReg = InProlog ? (unsigned)X86::RDX
+                                    : MRI.createVirtualRegister(RegClass),
+                 FinalReg = InProlog ? (unsigned)X86::RDX
+                                     : MRI.createVirtualRegister(RegClass),
+                 RoundedReg = InProlog ? (unsigned)X86::RDX
+                                       : MRI.createVirtualRegister(RegClass),
+                 LimitReg = InProlog ? (unsigned)X86::RCX
+                                     : MRI.createVirtualRegister(RegClass),
+                 JoinReg = InProlog ? (unsigned)X86::RCX
+                                    : MRI.createVirtualRegister(RegClass),
+                 ProbeReg = InProlog ? (unsigned)X86::RCX
+                                     : MRI.createVirtualRegister(RegClass);
+
+  // SP-relative offsets where we can save RCX and RDX.
+  int64_t RCXShadowSlot = 0;
+  int64_t RDXShadowSlot = 0;
+
+  // If inlining in the prolog, save RCX and RDX.
+  // Future optimization: don't save or restore if not live in.
+  if (InProlog) {
+    // Compute the offsets. We need to account for things already
+    // pushed onto the stack at this point: return address, frame
+    // pointer (if used), and callee saves.
+    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+    const bool HasFP = hasFP(MF);
+    RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
+    RDXShadowSlot = RCXShadowSlot + 8;
+    // Emit the saves.
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RCXShadowSlot)
+        .addReg(X86::RCX);
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RDXShadowSlot)
+        .addReg(X86::RDX);
+  } else {
+    // Not in the prolog. Copy RAX to a virtual reg.
+    BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+  }
+
+  // Add code to MBB to check for overflow and set the new target stack pointer
+  // to zero if so.
+  BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+      .addReg(ZeroReg, RegState::Undef)
+      .addReg(ZeroReg, RegState::Undef);
+  BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+  BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+      .addReg(CopyReg)
+      .addReg(SizeReg);
+  BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+      .addReg(TestReg)
+      .addReg(ZeroReg);
+
+  // FinalReg now holds final stack pointer value, or zero if
+  // allocation would overflow. Compare against the current stack
+  // limit from the thread environment block. Note this limit is the
+  // lowest touched page on the stack, not the point at which the OS
+  // will cause an overflow exception, so this is just an optimization
+  // to avoid unnecessarily touching pages that are below the current
+  // SP but already committed to the stack by the OS.
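+  // On Win64 the GS segment base points at the TEB; the NT_TIB StackLimit
+  // field at offset 0x10 is what the load below reads.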
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg) + .addReg(0) + .addImm(1) + .addReg(0) + .addImm(ThreadEnvironmentStackLimit) + .addReg(X86::GS); + BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); + // Jump if the desired stack pointer is at or above the stack limit. + BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); + + // Add code to roundMBB to round the final stack pointer to a page boundary. + BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) + .addReg(FinalReg) + .addImm(PageMask); + BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB); + + // LimitReg now holds the current stack limit, RoundedReg page-rounded + // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page + // and probe until we reach RoundedReg. + if (!InProlog) { + BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg) + .addReg(LimitReg) + .addMBB(RoundMBB) + .addReg(ProbeReg) + .addMBB(LoopMBB); + } + + addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, + false, -PageSize); + + // Probe by storing a byte onto the stack. + BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi)) + .addReg(ProbeReg) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0) + .addImm(0); + BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) + .addReg(RoundedReg) + .addReg(ProbeReg); + BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); + + MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); + + // If in prolog, restore RDX and RCX. + if (InProlog) { + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RCX), + X86::RSP, false, RCXShadowSlot); + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RDX), + X86::RSP, false, RDXShadowSlot); + } + + // Now that the probing is done, add code to continueMBB to update + // the stack pointer for real. + BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(SizeReg); + + // Add the control flow edges we need. + MBB.addSuccessor(ContinueMBB); + MBB.addSuccessor(RoundMBB); + RoundMBB->addSuccessor(LoopMBB); + LoopMBB->addSuccessor(ContinueMBB); + LoopMBB->addSuccessor(LoopMBB); + + // Mark all the instructions added to the prolog as frame setup. + if (InProlog) { + for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) { + BeforeMBBI->setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *RoundMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *LoopMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin(); + CMBBI != ContinueMBBI; ++CMBBI) { + CMBBI->setFlag(MachineInstr::FrameSetup); + } + } + + // Possible TODO: physreg liveness for InProlog case. + + return ContinueMBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; + + unsigned CallOp; + if (Is64Bit) + CallOp = IsLargeCodeModel ? 
X86::CALL64r : X86::CALL64pcrel32; + else + CallOp = X86::CALLpcrel32; + + const char *Symbol; + if (Is64Bit) { + if (STI.isTargetCygMing()) { + Symbol = "___chkstk_ms"; + } else { + Symbol = "__chkstk"; + } + } else if (STI.isTargetCygMing()) + Symbol = "_alloca"; + else + Symbol = "_chkstk"; + + MachineInstrBuilder CI; + MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); + + // All current stack probes take AX and SP as input, clobber flags, and + // preserve all registers. x86_64 probes leave RSP unmodified. + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // For the large code model, we have to call through a register. Use R11, + // as it is scratch in all supported calling conventions. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) + .addExternalSymbol(Symbol); + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); + } else { + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); + } + + unsigned AX = Is64Bit ? X86::RAX : X86::EAX; + unsigned SP = Is64Bit ? X86::RSP : X86::ESP; + CI.addReg(AX, RegState::Implicit) + .addReg(SP, RegState::Implicit) + .addReg(AX, RegState::Define | RegState::Implicit) + .addReg(SP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + + if (Is64Bit) { + // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp + // themselves. It also does not clobber %rax so we can reuse it when + // adjusting %rsp. + BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } + + if (InProlog) { + // Apply the frame setup flag to all inserted instrs. + for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI) + ExpansionMBBI->setFlag(MachineInstr::FrameSetup); + } + + return MBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeInlineStub( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + + assert(InProlog && "ChkStkStub called outside prolog!"); + + BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__chkstk_stub"); + + return MBBI; +} + +static unsigned calculateSetFPREG(uint64_t SPAdjust) { + // Win64 ABI has a less restrictive limitation of 240; 128 works equally well + // and might require smaller successive adjustments. + const uint64_t Win64MaxSEHOffset = 128; + uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset); + // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode. + return SEHFrameOffset & -16; +} + +// If we're forcing a stack realignment we can't rely on just the frame +// info, we need to know the ABI stack alignment as well in case we +// have a call out. Otherwise just make sure we have some alignment - we'll +// go with the minimum SlotSize. +uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + unsigned StackAlign = getStackAlignment(); + if (MF.getFunction()->hasFnAttribute("stackrealign")) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? 
StackAlign : MaxAlign; + else if (MaxAlign < SlotSize) + MaxAlign = SlotSize; + } + return MaxAlign; +} + +void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned Reg, + uint64_t MaxAlign) const { + uint64_t Val = -MaxAlign; + unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) + .addReg(Reg) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); +} + +/// emitPrologue - Push callee-saved registers onto the stack, which +/// automatically adjust the stack pointer. Adjust the stack pointer to allocate +/// space for local variables. Also emit labels used by the exception handler to +/// generate the exception handling frames. + +/* + Here's a gist of what gets emitted: + + ; Establish frame pointer, if needed + [if needs FP] + push %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + .seh_pushreg %rpb + mov %rsp, %rbp + .cfi_def_cfa_register %rbp + + ; Spill general-purpose registers + [for all callee-saved GPRs] + pushq %<reg> + [if not needs FP] + .cfi_def_cfa_offset (offset from RETADDR) + .seh_pushreg %<reg> + + ; If the required stack alignment > default stack alignment + ; rsp needs to be re-aligned. This creates a "re-alignment gap" + ; of unknown size in the stack frame. + [if stack needs re-alignment] + and $MASK, %rsp + + ; Allocate space for locals + [if target is Windows and allocated space > 4096 bytes] + ; Windows needs special care for allocations larger + ; than one page. + mov $NNN, %rax + call ___chkstk_ms/___chkstk + sub %rax, %rsp + [else] + sub $NNN, %rsp + + [if needs FP] + .seh_stackalloc (size of XMM spill slots) + .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots + [else] + .seh_stackalloc NNN + + ; Spill XMMs + ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved, + ; they may get spilled on any platform, if the current function + ; calls @llvm.eh.unwind.init + [if needs FP] + [for all callee-saved XMM registers] + movaps %<xmm reg>, -MMM(%rbp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset) + ; i.e. the offset relative to (%rbp - SEHFrameOffset) + [else] + [for all callee-saved XMM registers] + movaps %<xmm reg>, KKK(%rsp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, KKK + + .seh_endprologue + + [if needs base pointer] + mov %rsp, %rbx + [if needs to restore base pointer] + mov %rsp, -MMM(%rbp) + + ; Emit CFI info + [if needs FP] + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rbp) + [else] + .cfi_def_cfa_offset (offset from RETADDR) + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rsp) + + Notes: + - .seh directives are emitted only for Windows 64 ABI + - .cfi directives are emitted for all other ABIs + - for 32-bit code, substitute %e?? registers for %r?? +*/ + +void X86FrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + assert(&STI == &MF.getSubtarget<X86Subtarget>() && + "MF used frame lowering for wrong subtarget"); + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *Fn = MF.getFunction(); + MachineModuleInfo &MMI = MF.getMMI(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. 
+ uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. + bool IsFunclet = MBB.isEHFuncletEntry(); + EHPersonality Personality = EHPersonality::Unknown; + if (Fn->hasPersonalityFn()) + Personality = classifyEHPersonality(Fn->getPersonalityFn()); + bool FnHasClrFunclet = + MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR; + bool IsClrFunclet = IsFunclet && FnHasClrFunclet; + bool HasFP = hasFP(MF); + bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinCFI = IsWin64Prologue && Fn->needsUnwindTableEntry(); + bool NeedsDwarfCFI = + !IsWin64Prologue && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + unsigned FramePtr = TRI->getFrameRegister(MF); + const unsigned MachineFramePtr = + STI.isTarget64BitILP32() + ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; + unsigned BasePtr = TRI->getBaseRegister(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; + + // Add RETADDR move area to callee saved frame size. + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta && IsWin64Prologue) + report_fatal_error("Can't handle guaranteed tail call under win64 yet"); + + if (TailCallReturnAddrDelta < 0) + X86FI->setCalleeSavedFrameSize( + X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); + + bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + + // The default stack probe size is 4096 if the function has no stackprobesize + // attribute. + unsigned StackProbeSize = 4096; + if (Fn->hasFnAttribute("stack-probe-size")) + Fn->getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf + // function, and use up to 128 bytes of stack space, don't have a frame + // pointer, calls, or dynamic alloca then we do not need to adjust the + // stack pointer (we fit in the Red Zone). We also check that we don't + // push and pop from the stack. + if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && + !TRI->needsStackRealignment(MF) && + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64CC && // Win64 has no Red Zone + !MFI->hasOpaqueSPAdjustment() && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack + uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); + if (HasFP) MinSize += SlotSize; + StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); + MFI->setStackSize(StackSize); + } + + // Insert stack pointer adjustment for later moving of return addr. Only + // applies to tail call optimized functions where the callee argument stack + // size is bigger than the callers. 
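+  // TailCallReturnAddrDelta is negative in that case, so e.g. a delta of -8
+  // makes BuildStackAdjustment below move RSP down by 8 bytes, roughly
+  // reserving the room the return address is later moved into for the tail
+  // call.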
+ if (TailCallReturnAddrDelta < 0) { + BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta, + /*InEpilogue=*/false) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Mapping for machine moves: + // + // DST: VirtualFP AND + // SRC: VirtualFP => DW_CFA_def_cfa_offset + // ELSE => DW_CFA_def_cfa + // + // SRC: VirtualFP AND + // DST: Register => DW_CFA_def_cfa_register + // + // ELSE + // OFFSET < 0 => DW_CFA_offset_extended_sf + // REG < 64 => DW_CFA_offset + Reg + // ELSE => DW_CFA_offset_extended + + uint64_t NumBytes = 0; + int stackGrowth = -SlotSize; + + // Find the funclet establisher parameter + unsigned Establisher = X86::NoRegister; + if (IsClrFunclet) + Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX; + else if (IsFunclet) + Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX; + + if (IsWin64Prologue && IsFunclet && !IsClrFunclet) { + // Immediately spill establisher into the home slot. + // The runtime cares about this. + // MOV64mr %rdx, 16(%rsp) + unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16) + .addReg(Establisher) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(Establisher); + } + + if (HasFP) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + + // Callee-saved registers are pushed on stack before the stack is realigned. + if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) + NumBytes = RoundUpToAlignment(NumBytes, MaxAlign); + + // Get the offset of the stack slot for the EBP register, which is + // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. + // Update the frame offset adjustment. + if (!IsFunclet) + MFI->setOffsetAdjustment(-NumBytes); + else + assert(MFI->getOffsetAdjustment() == -(int)NumBytes && + "should calculate same local variable offset for funclets"); + + // Save EBP/RBP into the appropriate stack slot. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) + .addReg(MachineFramePtr, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + + if (NeedsDwarfCFI) { + // Mark the place where EBP/RBP was saved. + // Define the current CFA rule to use the provided offset. + assert(StackSize); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); + + // Change the rule for the FramePtr to be an "offset" rule. + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( + nullptr, DwarfFramePtr, 2 * stackGrowth)); + } + + if (NeedsWinCFI) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (!IsWin64Prologue && !IsFunclet) { + // Update EBP with the new base value. + BuildMI(MBB, MBBI, DL, + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), + FramePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + if (NeedsDwarfCFI) { + // Mark effective beginning of when frame pointer becomes valid. + // Define the current CFA to use the EBP/RBP register. + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister( + nullptr, DwarfFramePtr)); + } + } + + // Mark the FramePtr as live-in in every block. 
Don't do this again for + // funclet prologues. + if (!IsFunclet) { + for (MachineBasicBlock &EveryMBB : MF) + EveryMBB.addLiveIn(MachineFramePtr); + } + } else { + assert(!IsFunclet && "funclets without FPs not yet implemented"); + NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); + } + + // For EH funclets, only allocate enough space for outgoing calls. Save the + // NumBytes value that we would've used for the parent frame. + unsigned ParentFrameNumBytes = NumBytes; + if (IsFunclet) + NumBytes = getWinEHFuncletFrameSize(MF); + + // Skip the callee-saved push instructions. + bool PushedRegs = false; + int StackOffset = 2 * stackGrowth; + + while (MBBI != MBB.end() && + MBBI->getFlag(MachineInstr::FrameSetup) && + (MBBI->getOpcode() == X86::PUSH32r || + MBBI->getOpcode() == X86::PUSH64r)) { + PushedRegs = true; + unsigned Reg = MBBI->getOperand(0).getReg(); + ++MBBI; + + if (!HasFP && NeedsDwarfCFI) { + // Mark callee-saved push instruction. + // Define the current CFA rule to use the provided offset. + assert(StackSize); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); + StackOffset += stackGrowth; + } + + if (NeedsWinCFI) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( + MachineInstr::FrameSetup); + } + } + + // Realign stack after we pushed callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + // Don't do this for Win64, it needs to realign the stack after the prologue. + if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign); + } + + // If there is an SUB32ri of ESP immediately before this instruction, merge + // the two. This can be the case when tail call elimination is enabled and + // the callee has more arguments then the caller. + NumBytes -= mergeSPUpdates(MBB, MBBI, true); + + // Adjust stack pointer: ESP -= numbytes. + + // Windows and cygwin/mingw require a prologue helper routine when allocating + // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw + // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the + // stack and adjust the stack pointer in one go. The 64-bit version of + // __chkstk is only responsible for probing the stack. The 64-bit prologue is + // responsible for adjusting the stack pointer. Touching the stack at 4K + // increments is necessary to ensure that the guard pages used by the OS + // virtual memory manager are allocated in correct sequence. + uint64_t AlignedNumBytes = NumBytes; + if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) + AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); + if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { + // Check whether EAX is livein for this function. + bool isEAXAlive = isEAXLiveIn(MF); + + if (isEAXAlive) { + // Sanity check that EAX is not livein for this function. + // It should not be, so throw an assert. + assert(!Is64Bit && "EAX is livein in x64 case!"); + + // Save EAX + BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) + .addReg(X86::EAX, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (Is64Bit) { + // Handle the 64-bit Windows ABI case where we need to call __chkstk. + // Function prologue is responsible for adjusting the stack pointer. 
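+      // Pick the cheapest way to materialize NumBytes in RAX (via EAX for
+      // small values): MOV32ri zero-extends a value that fits in 32 unsigned
+      // bits, MOV64ri32 sign-extends a 32-bit immediate, and MOV64ri is the
+      // full 10-byte 64-bit immediate form, used only as a last resort.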
+ if (isUInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else if (isInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + } else { + // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. + // We'll also use 4 already allocated bytes for EAX. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Call __chkstk, __chkstk_ms, or __alloca. + emitStackProbe(MF, MBB, MBBI, DL, true); + + if (isEAXAlive) { + // Restore EAX + MachineInstr *MI = + addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), + StackPtr, false, NumBytes - 4); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); + } + } else if (NumBytes) { + emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false); + } + + if (NeedsWinCFI && NumBytes) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + + int SEHFrameOffset = 0; + unsigned SPOrEstablisher; + if (IsFunclet) { + if (IsClrFunclet) { + // The establisher parameter passed to a CLR funclet is actually a pointer + // to the (mostly empty) frame of its nearest enclosing funclet; we have + // to find the root function establisher frame by loading the PSPSym from + // the intermediate frame. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + MachinePointerInfo NoInfo; + MBB.addLiveIn(Establisher); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher), + Establisher, false, PSPSlotOffset) + .addMemOperand(MF.getMachineMemOperand( + NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize)); + ; + // Save the root establisher back into the current funclet's (mostly + // empty) frame, in case a sub-funclet or the GC needs it. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, + false, PSPSlotOffset) + .addReg(Establisher) + .addMemOperand( + MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + SPOrEstablisher = Establisher; + } else { + SPOrEstablisher = StackPtr; + } + + if (IsWin64Prologue && HasFP) { + // Set RBP to a small fixed offset from RSP. In the funclet case, we base + // this calculation on the incoming establisher, which holds the value of + // RSP from the parent frame at the end of the prologue. + SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes); + if (SEHFrameOffset) + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), + SPOrEstablisher, false, SEHFrameOffset); + else + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr) + .addReg(SPOrEstablisher); + + // If this is not a funclet, emit the CFI describing our frame pointer. + if (NeedsWinCFI && !IsFunclet) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) + .addImm(FramePtr) + .addImm(SEHFrameOffset) + .setMIFlag(MachineInstr::FrameSetup); + if (isAsynchronousEHPersonality(Personality)) + MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset; + } + } else if (IsFunclet && STI.is32Bit()) { + // Reset EBP / ESI to something good for funclets. + MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL); + // If we're a catch funclet, we can be returned to via catchret. 
Save ESP + // into the registration node so that the runtime will restore it for us. + if (!MBB.isCleanupFuncletEntry()) { + assert(Personality == EHPersonality::MSVC_CXX); + unsigned FrameReg; + int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; + int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg); + // ESP is the first field, so no extra displacement is needed. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg, + false, EHRegOffset) + .addReg(X86::ESP); + } + } + + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { + const MachineInstr *FrameInstr = &*MBBI; + ++MBBI; + + if (NeedsWinCFI) { + int FI; + if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { + if (X86::FR64RegClass.contains(Reg)) { + unsigned IgnoredFrameReg; + int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); + Offset += SEHFrameOffset; + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + .addImm(Reg) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); + } + } + } + } + + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) + .setMIFlag(MachineInstr::FrameSetup); + + if (FnHasClrFunclet && !IsFunclet) { + // Save the so-called Initial-SP (i.e. the value of the stack pointer + // immediately after the prolog) into the PSPSlot so that funclets + // and the GC can recover it. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + auto PSPInfo = MachinePointerInfo::getFixedStack( + MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, + PSPSlotOffset) + .addReg(StackPtr) + .addMemOperand(MF.getMachineMemOperand( + PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + + // Realign stack after we spilled callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + // Win64 requires aligning the stack after the prologue. + if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign); + } + + // We already dealt with stack realignment and funclets above. + if (IsFunclet && STI.is32Bit()) + return; + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (TRI->hasBasePointer(MF)) { + // Update the base pointer with the current stack pointer. + unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; + BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) + .addReg(SPOrEstablisher) + .setMIFlag(MachineInstr::FrameSetup); + if (X86FI->getRestoreBasePointer()) { + // Stash value of base pointer. Saving RSP instead of EBP shortens + // dependence chain. Used by SjLj EH. + unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .addReg(SPOrEstablisher) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) { + // Stash the value of the frame pointer relative to the base pointer for + // Win32 EH. This supports Win32 EH, which does the inverse of the above: + // it recovers the frame pointer from the base pointer rather than the + // other way around. + unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64mr : X86::MOV32mr; + unsigned UsedReg; + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset) + .addReg(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } + } + + if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { + // Mark end of stack pointer adjustment. + if (!HasFP && NumBytes) { + // Define the current CFA rule to use the provided offset. + assert(StackSize); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset( + nullptr, -StackSize + stackGrowth)); + } + + // Emit DWARF info specifying the offsets of the callee-saved registers. + if (PushedRegs) + emitCalleeSavedFrameMoves(MBB, MBBI, DL); + } +} + +bool X86FrameLowering::canUseLEAForSPInEpilogue( + const MachineFunction &MF) const { + // We can't use LEA instructions for adjusting the stack pointer if this is a + // leaf function in the Win64 ABI. Only ADD instructions may be used to + // deallocate the stack. + // This means that we can use LEA for SP in two situations: + // 1. We *aren't* using the Win64 ABI which means we are free to use LEA. + // 2. We *have* a frame pointer which means we are permitted to use LEA. + return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); +} + +static bool isFuncletReturnInstr(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CATCHRET: + case X86::CLEANUPRET: + return true; + default: + return false; + } + llvm_unreachable("impossible"); +} + +// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the +// stack. It holds a pointer to the bottom of the root function frame. The +// establisher frame pointer passed to a nested funclet may point to the +// (mostly empty) frame of its parent funclet, but it will need to find +// the frame of the root function to access locals. To facilitate this, +// every funclet copies the pointer to the bottom of the root function +// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the +// same offset for the PSPSym in the root function frame that's used in the +// funclets' frames allows each funclet to dynamically accept any ancestor +// frame as its establisher argument (the runtime doesn't guarantee the +// immediate parent for some reason lost to history), and also allows the GC, +// which uses the PSPSym for some bookkeeping, to find it in any funclet's +// frame with only a single offset reported for the entire method. +unsigned +X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { + const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); + // getFrameIndexReferenceFromSP has an out ref parameter for the stack + // pointer register; pass a dummy that we ignore + unsigned SPReg; + int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg); + assert(Offset >= 0); + return static_cast<unsigned>(Offset); +} + +unsigned +X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { + // This is the size of the pushed CSRs. + unsigned CSSize = + MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // This is the amount of stack a funclet needs to allocate. 
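+  // For illustration, with five pushed CSRs (CSSize = 40), UsedSize = 32 and
+  // the usual 16-byte stack alignment, the round-up below gives 80, so each
+  // funclet allocates 80 - 40 = 40 bytes on top of its pushes.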
+ unsigned UsedSize; + EHPersonality Personality = + classifyEHPersonality(MF.getFunction()->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + // CLR funclets need to hold enough space to include the PSPSym, at the + // same offset from the stack pointer (immediately after the prolog) as it + // resides at in the main function. + UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize; + } else { + // Other funclets just need enough stack for outgoing call arguments. + UsedSize = MF.getFrameInfo()->getMaxCallFrameSize(); + } + // RBP is not included in the callee saved register block. After pushing RBP, + // everything is 16 byte aligned. Everything we allocate before an outgoing + // call must also be 16 byte aligned. + unsigned FrameSizeMinusRBP = + RoundUpToAlignment(CSSize + UsedSize, getStackAlignment()); + // Subtract out the size of the callee saved registers. This is how much stack + // each funclet will allocate. + return FrameSizeMinusRBP - CSSize; +} + +void X86FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Is64BitILP32 = STI.isTarget64BitILP32(); + unsigned FramePtr = TRI->getFrameRegister(MF); + unsigned MachineFramePtr = + Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; + + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinCFI = + IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); + bool IsFunclet = isFuncletReturnInstr(MBBI); + MachineBasicBlock *TargetMBB = nullptr; + + // Get the number of bytes to allocate from the FrameInfo. + uint64_t StackSize = MFI->getStackSize(); + uint64_t MaxAlign = calculateMaxStackAlign(MF); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + uint64_t NumBytes = 0; + + if (MBBI->getOpcode() == X86::CATCHRET) { + // SEH shouldn't use catchret. + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())) && + "SEH should not use CATCHRET"); + + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + TargetMBB = MBBI->getOperand(0).getMBB(); + + // Pop EBP. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (MBBI->getOpcode() == X86::CLEANUPRET) { + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (hasFP(MF)) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + NumBytes = FrameSize - CSSize; + + // Callee-saved registers were pushed on stack before the stack was + // realigned. + if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) + NumBytes = RoundUpToAlignment(FrameSize, MaxAlign); + + // Pop EBP. + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else { + NumBytes = StackSize - CSSize; + } + uint64_t SEHStackAllocAmt = NumBytes; + + // Skip the callee-saved pop instructions. 
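+  // Walk backwards from the terminator over POPs tagged FrameDestroy (and any
+  // debug values), so the stack-pointer adjustment emitted further down lands
+  // before the first callee-saved pop; FirstCSPop remembers that point.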
+ while (MBBI != MBB.begin()) { + MachineBasicBlock::iterator PI = std::prev(MBBI); + unsigned Opc = PI->getOpcode(); + + if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && + (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && + Opc != X86::DBG_VALUE && !PI->isTerminator()) + break; + + --MBBI; + } + MachineBasicBlock::iterator FirstCSPop = MBBI; + + if (TargetMBB) { + // Fill EAX/RAX with the address of the target block. + unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX; + if (STI.is64Bit()) { + // LEA64r TargetMBB(%rip), %rax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(TargetMBB) + .addReg(0); + } else { + // MOV32ri $TargetMBB, %eax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg) + .addMBB(TargetMBB); + } + // Record that we've taken the address of TargetMBB and no longer just + // reference it in a terminator. + TargetMBB->setHasAddressTaken(); + } + + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + + // If there is an ADD32ri or SUB32ri of ESP immediately before this + // instruction, merge the two instructions. + if (NumBytes || MFI->hasVarSizedObjects()) + NumBytes += mergeSPUpdates(MBB, MBBI, true); + + // If dynamic alloca is used, then reset esp to point to the last callee-saved + // slot before popping them off! Same applies for the case, when stack was + // realigned. Don't do this if this was a funclet epilogue, since the funclets + // will not do realignment or dynamic stack allocation. + if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) && + !IsFunclet) { + if (TRI->needsStackRealignment(MF)) + MBBI = FirstCSPop; + unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); + uint64_t LEAAmount = + IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; + + // There are only two legal forms of epilogue: + // - add SEHAllocationSize, %rsp + // - lea SEHAllocationSize(%FramePtr), %rsp + // + // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence. + // However, we may use this sequence if we have a frame pointer because the + // effects of the prologue can safely be undone. + if (LEAAmount != 0) { + unsigned Opc = getLEArOpcode(Uses64BitFramePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), + FramePtr, false, LEAAmount); + --MBBI; + } else { + unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); + BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(FramePtr); + --MBBI; + } + } else if (NumBytes) { + // Adjust stack pointer back: ESP += numbytes. + emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true); + --MBBI; + } + + // Windows unwinder will not invoke function's exception handler if IP is + // either in prologue or in epilogue. This behavior causes a problem when a + // call immediately precedes an epilogue, because the return address points + // into the epilogue. To cope with that, we insert an epilogue marker here, + // then replace it with a 'nop' if it ends up immediately after a CALL in the + // final emitted code. + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); + + // Add the return addr area delta back since we are not tail calling. + int Offset = -1 * X86FI->getTCReturnAddrDelta(); + assert(Offset >= 0 && "TCDelta should never be positive"); + if (Offset) { + MBBI = MBB.getFirstTerminator(); + + // Check for possible merge with preceding ADD instruction. 
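+    // e.g. an `add $16, %rsp` emitted just above gets folded into Offset so
+    // that only a single SP update is issued here.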
+ Offset += mergeSPUpdates(MBB, MBBI, true); + emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true); + } +} + +// NOTE: this only has a subset of the full frame index logic. In +// particular, the FI < 0 and AfterFPPop logic is handled in +// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly +// (probably?) it should be moved into here. +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. + if (TRI->hasBasePointer(MF)) + FrameReg = TRI->getBaseRegister(); + else if (TRI->needsStackRealignment(MF)) + FrameReg = TRI->getStackRegister(); + else + FrameReg = TRI->getFrameRegister(MF); + + // Offset will hold the offset from the stack pointer at function entry to the + // object. + // We need to factor in additional offsets applied during the prologue to the + // frame, base, and stack pointer depending on which is used. + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + uint64_t StackSize = MFI->getStackSize(); + bool HasFP = hasFP(MF); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + int64_t FPDelta = 0; + + if (IsWin64Prologue) { + assert(!MFI->hasCalls() || (StackSize % 16) == 8); + + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + uint64_t NumBytes = FrameSize - CSSize; + + uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes); + if (FI && FI == X86FI->getFAIndex()) + return -SEHFrameOffset; + + // FPDelta is the offset from the "traditional" FP location of the old base + // pointer followed by return address and the location required by the + // restricted Win64 prologue. + // Add FPDelta to all offsets below that go through the frame pointer. + FPDelta = FrameSize - SEHFrameOffset; + assert((!MFI->hasCalls() || (FPDelta % 16) == 0) && + "FPDelta isn't aligned per the Win64 ABI!"); + } + + + if (TRI->hasBasePointer(MF)) { + assert(HasFP && "VLAs and dynamic stack realign, but no FP?!"); + if (FI < 0) { + // Skip the saved EBP. + return Offset + SlotSize + FPDelta; + } else { + assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + return Offset + StackSize; + } + } else if (TRI->needsStackRealignment(MF)) { + if (FI < 0) { + // Skip the saved EBP. + return Offset + SlotSize + FPDelta; + } else { + assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + return Offset + StackSize; + } + // FIXME: Support tail calls + } else { + if (!HasFP) + return Offset + StackSize; + + // Skip the saved EBP. 
+ Offset += SlotSize; + + // Skip the RETADDR move area + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta < 0) + Offset -= TailCallReturnAddrDelta; + } + + return Offset + FPDelta; +} + +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Does not include any dynamic realign. + const uint64_t StackSize = MFI->getStackSize(); + { +#ifndef NDEBUG + // LLVM arranges the stack as follows: + // ... + // ARG2 + // ARG1 + // RETADDR + // PUSH RBP <-- RBP points here + // PUSH CSRs + // ~~~~~~~ <-- possible stack realignment (non-win64) + // ... + // STACK OBJECTS + // ... <-- RSP after prologue points here + // ~~~~~~~ <-- possible stack realignment (win64) + // + // if (hasVarSizedObjects()): + // ... <-- "base pointer" (ESI/RBX) points here + // DYNAMIC ALLOCAS + // ... <-- RSP points here + // + // Case 1: In the simple case of no stack realignment and no dynamic + // allocas, both "fixed" stack objects (arguments and CSRs) are addressable + // with fixed offsets from RSP. + // + // Case 2: In the case of stack realignment with no dynamic allocas, fixed + // stack objects are addressed with RBP and regular stack objects with RSP. + // + // Case 3: In the case of dynamic allocas and stack realignment, RSP is used + // to address stack arguments for outgoing calls and nothing else. The "base + // pointer" points to local variables, and RBP points to fixed objects. + // + // In cases 2 and 3, we can only answer for non-fixed stack objects, and the + // answer we give is relative to the SP after the prologue, and not the + // SP in the middle of the function. + + assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) || + STI.isTargetWin64()) && + "offset from fixed object to SP is not static"); + + // We don't handle tail calls, and shouldn't be seeing them either. + int TailCallReturnAddrDelta = + MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); + assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); +#endif + } + + // Fill in FrameReg output argument. + FrameReg = TRI->getStackRegister(); + + // This is how the math works out: + // + // %rsp grows (i.e. gets lower) left to right. Each box below is + // one word (eight bytes). Obj0 is the stack slot we're trying to + // get to. + // + // ---------------------------------- + // | BP | Obj0 | Obj1 | ... | ObjN | + // ---------------------------------- + // ^ ^ ^ ^ + // A B C E + // + // A is the incoming stack pointer. + // (B - A) is the local area offset (-8 for x86-64) [1] + // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] + // + // |(E - B)| is the StackSize (absolute value, positive). For a + // stack that grown down, this works out to be (B - E). [3] + // + // E is also the value of %rsp after stack has been set up, and we + // want (C - E) -- the value we can add to %rsp to get to Obj0. 
Now + // (C - E) == (C - A) - (B - A) + (B - E) + // { Using [1], [2] and [3] above } + // == getObjectOffset - LocalAreaOffset + StackSize + // + + // Get the Offset from the StackPointer + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + + return Offset + StackSize; +} + +bool X86FrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + + unsigned CalleeSavedFrameSize = 0; + int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); + + if (hasFP(MF)) { + // emitPrologue always spills frame register the first thing. + SpillSlotOffset -= SlotSize; + MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + + // Since emitPrologue and emitEpilogue will handle spilling and restoring of + // the frame register, we can delete it from CSI list and not have to worry + // about avoiding it later. + unsigned FPReg = TRI->getFrameRegister(MF); + for (unsigned i = 0; i < CSI.size(); ++i) { + if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { + CSI.erase(CSI.begin() + i); + break; + } + } + } + + // Assign slots for GPRs. It increases frame size. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + + SpillSlotOffset -= SlotSize; + CalleeSavedFrameSize += SlotSize; + + int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + } + + X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); + + // Assign slots for XMMs. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + // ensure alignment + SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); + // spill into slot + SpillSlotOffset -= RC->getSize(); + int SlotIndex = + MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + MFI->ensureMaxAlignment(RC->getAlignment()); + } + + return true; +} + +bool X86FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + DebugLoc DL = MBB.findDebugLoc(MI); + + // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI + // for us, and there are no XMM CSRs on Win32. + if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows()) + return true; + + // Push GPRs. It increases frame size. + unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Make XMM regs spilled. X86 does not have ability of push/pop XMM. + // It can be done by spilling XMMs to stack frame. 
+ for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, + TRI); + --MI; + MI->setFlag(MachineInstr::FrameSetup); + ++MI; + } + + return true; +} + +bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + if (isFuncletReturnInstr(MI) && STI.isOSWindows()) { + // Don't restore CSRs in 32-bit EH funclets. Matches + // spillCalleeSavedRegisters. + if (STI.is32Bit()) + return true; + // Don't restore CSRs before an SEH catchret. SEH except blocks do not form + // funclets. emitEpilogue transforms these to normal jumps. + if (MI->getOpcode() == X86::CATCHRET) { + const Function *Func = MBB.getParent()->getFunction(); + bool IsSEH = isAsynchronousEHPersonality( + classifyEHPersonality(Func->getPersonalityFn())); + if (IsSEH) + return true; + } + } + + DebugLoc DL = MBB.findDebugLoc(MI); + + // Reload XMMs from stack frame. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); + } + + // POP GPRs. + unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; + + BuildMI(MBB, MI, DL, TII.get(Opc), Reg) + .setMIFlag(MachineInstr::FrameDestroy); + } + return true; +} + +void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + + if (TailCallReturnAddrDelta < 0) { + // create RETURNADDR area + // arg + // arg + // RETADDR + // { ... + // RETADDR area + // ... + // } + // [EBP] + MFI->CreateFixedObject(-TailCallReturnAddrDelta, + TailCallReturnAddrDelta - SlotSize, true); + } + + // Spill the BasePtr if it's used. + if (TRI->hasBasePointer(MF)) { + SavedRegs.set(TRI->getBaseRegister()); + + // Allocate a spill slot for EBP if we have a base pointer and EH funclets. + if (MF.getMMI().hasEHFunclets()) { + int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize); + X86FI->setHasSEHFramePtrSave(true); + X86FI->setSEHFramePtrSaveIndex(FI); + } + } +} + +static bool +HasNestArgument(const MachineFunction *MF) { + const Function *F = MF->getFunction(); + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; I++) { + if (I->hasNestAttr()) + return true; + } + return false; +} + +/// GetScratchRegister - Get a temp register for performing work in the +/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform +/// and the properties of the function either one or two registers will be +/// needed. 
Set primary to true for the first register, false for the second. +static unsigned +GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { + CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); + + // Erlang stuff. + if (CallingConvention == CallingConv::HiPE) { + if (Is64Bit) + return Primary ? X86::R14 : X86::R13; + else + return Primary ? X86::EBX : X86::EDI; + } + + if (Is64Bit) { + if (IsLP64) + return Primary ? X86::R11 : X86::R12; + else + return Primary ? X86::R11D : X86::R12D; + } + + bool IsNested = HasNestArgument(&MF); + + if (CallingConvention == CallingConv::X86_FastCall || + CallingConvention == CallingConv::Fast) { + if (IsNested) + report_fatal_error("Segmented stacks does not support fastcall with " + "nested function."); + return Primary ? X86::EAX : X86::ECX; + } + if (IsNested) + return Primary ? X86::EDX : X86::EAX; + return Primary ? X86::ECX : X86::EAX; +} + +// The stack limit in the TCB is set to this many bytes above the actual stack +// limit. +static const uint64_t kSplitStackAvailable = 256; + +void X86FrameLowering::adjustForSegmentedStacks( + MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + uint64_t StackSize; + unsigned TlsReg, TlsOffset; + DebugLoc DL; + + unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); + assert(!MF.getRegInfo().isLiveIn(ScratchReg) && + "Scratch register is live-in"); + + if (MF.getFunction()->isVarArg()) + report_fatal_error("Segmented stacks do not support vararg functions."); + if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && + !STI.isTargetWin64() && !STI.isTargetFreeBSD() && + !STI.isTargetDragonFly()) + report_fatal_error("Segmented stacks not supported on this platform."); + + // Eventually StackSize will be calculated by a link-time pass; which will + // also decide whether checking code needs to be injected into this particular + // prologue. + StackSize = MFI->getStackSize(); + + // Do not generate a prologue for functions with a stack of size zero + if (StackSize == 0) + return; + + MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + bool IsNested = false; + + // We need to know if the function has a nest argument only in 64 bit mode. + if (Is64Bit) + IsNested = HasNestArgument(&MF); + + // The MOV R10, RAX needs to be in a different block, since the RET we emit in + // allocMBB needs to be last (terminating) instruction. + + for (const auto &LI : PrologueMBB.liveins()) { + allocMBB->addLiveIn(LI); + checkMBB->addLiveIn(LI); + } + + if (IsNested) + allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D); + + MF.push_front(allocMBB); + MF.push_front(checkMBB); + + // When the frame size is less than 256 we just compare the stack + // boundary directly to the value of the stack pointer, per gcc. + bool CompareStackPointer = StackSize < kSplitStackAvailable; + + // Read the limit off the current stacklet off the stack_guard location. + if (Is64Bit) { + if (STI.isTargetLinux()) { + TlsReg = X86::FS; + TlsOffset = IsLP64 ? 0x70 : 0x40; + } else if (STI.isTargetDarwin()) { + TlsReg = X86::GS; + TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. 
+ } else if (STI.isTargetWin64()) { + TlsReg = X86::GS; + TlsOffset = 0x28; // pvArbitrary, reserved for application use + } else if (STI.isTargetFreeBSD()) { + TlsReg = X86::FS; + TlsOffset = 0x18; + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x20; // use tls_tcb.tcb_segstack + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = IsLP64 ? X86::RSP : X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else { + if (STI.isTargetLinux()) { + TlsReg = X86::GS; + TlsOffset = 0x30; + } else if (STI.isTargetDarwin()) { + TlsReg = X86::GS; + TlsOffset = 0x48 + 90*4; + } else if (STI.isTargetWin32()) { + TlsReg = X86::FS; + TlsOffset = 0x14; // pvArbitrary, reserved for application use + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x10; // use tls_tcb.tcb_segstack + } else if (STI.isTargetFreeBSD()) { + report_fatal_error("Segmented stacks not supported on FreeBSD i386."); + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + + if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || + STI.isTargetDragonFly()) { + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) + .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else if (STI.isTargetDarwin()) { + + // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. + unsigned ScratchReg2; + bool SaveScratch2; + if (CompareStackPointer) { + // The primary scratch register is available for holding the TLS offset. + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true); + SaveScratch2 = false; + } else { + // Need to use a second register to hold the TLS offset + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false); + + // Unfortunately, with fastcc the second scratch register may hold an + // argument. + SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); + } + + // If Scratch2 is live-in then it needs to be saved. + assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && + "Scratch register is live-in and not saved"); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::PUSH32r)) + .addReg(ScratchReg2, RegState::Kill); + + BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2) + .addImm(TlsOffset); + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) + .addReg(ScratchReg) + .addReg(ScratchReg2).addImm(1).addReg(0) + .addImm(0) + .addReg(TlsReg); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2); + } + } + + // This jump is taken if SP >= (Stacklet Limit + Stack Space required). + // It jumps to normal execution of the function body. + BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB); + + // On 32 bit we first push the arguments size and then the frame size. On 64 + // bit, we pass the stack frame size in r10 and the argument size in r11. 
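+  // Roughly, the sequence emitted below for the 64-bit case is:
+  //   mov %r10, %rax           # only if the function has a nest argument
+  //   mov $StackSize, %r10
+  //   mov $ArgSize, %r11
+  //   callq __morestack
+  //   ret                      # MORESTACK_RET / MORESTACK_RET_RESTORE_R10
+  // and for the 32-bit case:
+  //   push $ArgSize
+  //   push $StackSize
+  //   calll __morestack
+  //   ret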
+ if (Is64Bit) { + // Functions with nested arguments use R10, so it needs to be saved across + // the call to _morestack + + const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX; + const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; + const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; + const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr; + const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri; + + if (IsNested) + BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); + + BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) + .addImm(StackSize); + BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) + .addImm(X86FI->getArgumentStackSize()); + } else { + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(X86FI->getArgumentStackSize()); + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(StackSize); + } + + // __morestack is in libgcc + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // Under the large code model, we cannot assume that __morestack lives + // within 2^31 bytes of the call site, so we cannot use pc-relative + // addressing. We cannot perform the call via a temporary register, + // as the rax register may be used to store the static chain, and all + // other suitable registers may be either callee-save or used for + // parameter passing. We cannot use the stack at this point either + // because __morestack manipulates the stack directly. + // + // To avoid these issues, perform an indirect call via a read-only memory + // location containing the address. + // + // This solution is not perfect, as it assumes that the .rodata section + // is laid out within 2^31 bytes of each function body, but this seems + // to be sufficient for JIT. + BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addExternalSymbol("__morestack_addr") + .addReg(0); + MF.getMMI().setUsesMorestackAddr(true); + } else { + if (Is64Bit) + BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack"); + else + BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack"); + } + + if (IsNested) + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); + else + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); + + allocMBB->addSuccessor(&PrologueMBB); + + checkMBB->addSuccessor(allocMBB); + checkMBB->addSuccessor(&PrologueMBB); + +#ifdef XDEBUG + MF.verify(); +#endif +} + +/// Erlang programs may need a special prologue to handle the stack size they +/// might need at runtime. That is because Erlang/OTP does not implement a C +/// stack but uses a custom implementation of hybrid stack/heap architecture. +/// (for more information see Eric Stenman's Ph.D. thesis: +/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) +/// +/// CheckStack: +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// OldStart: +/// ... +/// IncStack: +/// call inc_stack # doubles the stack space +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +void X86FrameLowering::adjustForHiPEPrologue( + MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + DebugLoc DL; + // HiPE-specific values + const unsigned HipeLeafWords = 24; + const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; + const unsigned Guaranteed = HipeLeafWords * SlotSize; + unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ? 
+ MF.getFunction()->arg_size() - CCRegisteredArgs : 0; + unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize; + + assert(STI.isTargetLinux() && + "HiPE prologue is only supported on Linux operating systems."); + + // Compute the largest caller's frame that is needed to fit the callees' + // frames. This 'MaxStack' is computed from: + // + // a) the fixed frame size, which is the space needed for all spilled temps, + // b) outgoing on-stack parameter areas, and + // c) the minimum stack space this function needs to make available for the + // functions it calls (a tunable ABI property). + if (MFI->hasCalls()) { + unsigned MoreStackForCalls = 0; + + for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); + MBBI != MBBE; ++MBBI) + for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end(); + MI != ME; ++MI) { + if (!MI->isCall()) + continue; + + // Get callee operand. + const MachineOperand &MO = MI->getOperand(0); + + // Only take account of global function calls (no closures etc.). + if (!MO.isGlobal()) + continue; + + const Function *F = dyn_cast<Function>(MO.getGlobal()); + if (!F) + continue; + + // Do not update 'MaxStack' for primitive and built-in functions + // (encoded with names either starting with "erlang."/"bif_" or not + // having a ".", such as a simple <Module>.<Function>.<Arity>, or an + // "_", such as the BIF "suspend_0") as they are executed on another + // stack. + if (F->getName().find("erlang.") != StringRef::npos || + F->getName().find("bif_") != StringRef::npos || + F->getName().find_first_of("._") == StringRef::npos) + continue; + + unsigned CalleeStkArity = + F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0; + if (HipeLeafWords - 1 > CalleeStkArity) + MoreStackForCalls = std::max(MoreStackForCalls, + (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); + } + MaxStack += MoreStackForCalls; + } + + // If the stack frame needed is larger than the guaranteed then runtime checks + // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. + if (MaxStack > Guaranteed) { + MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); + + for (const auto &LI : PrologueMBB.liveins()) { + stackCheckMBB->addLiveIn(LI); + incStackMBB->addLiveIn(LI); + } + + MF.push_front(incStackMBB); + MF.push_front(stackCheckMBB); + + unsigned ScratchReg, SPReg, PReg, SPLimitOffset; + unsigned LEAop, CMPop, CALLop; + if (Is64Bit) { + SPReg = X86::RSP; + PReg = X86::RBP; + LEAop = X86::LEA64r; + CMPop = X86::CMP64rm; + CALLop = X86::CALL64pcrel32; + SPLimitOffset = 0x90; + } else { + SPReg = X86::ESP; + PReg = X86::EBP; + LEAop = X86::LEA32r; + CMPop = X86::CMP32rm; + CALLop = X86::CALLpcrel32; + SPLimitOffset = 0x4c; + } + + ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); + assert(!MF.getRegInfo().isLiveIn(ScratchReg) && + "HiPE prologue scratch register is live-in"); + + // Create new MBB for StackCheck: + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), + SPReg, false, -MaxStack); + // SPLimitOffset is in a fixed heap location (pointed by BP). + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) + .addReg(ScratchReg), PReg, false, SPLimitOffset); + BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB); + + // Create new MBB for IncStack: + BuildMI(incStackMBB, DL, TII.get(CALLop)). 
+ addExternalSymbol("inc_stack_0"); + addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), + SPReg, false, -MaxStack); + addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) + .addReg(ScratchReg), PReg, false, SPLimitOffset); + BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); + + stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); + stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); + incStackMBB->addSuccessor(&PrologueMBB, {99, 100}); + incStackMBB->addSuccessor(incStackMBB, {1, 100}); + } +#ifdef XDEBUG + MF.verify(); +#endif +} + +bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const { + + if (Offset <= 0) + return false; + + if (Offset % SlotSize) + return false; + + int NumPops = Offset / SlotSize; + // This is only worth it if we have at most 2 pops. + if (NumPops != 1 && NumPops != 2) + return false; + + // Handle only the trivial case where the adjustment directly follows + // a call. This is the most common one, anyway. + if (MBBI == MBB.begin()) + return false; + MachineBasicBlock::iterator Prev = std::prev(MBBI); + if (!Prev->isCall() || !Prev->getOperand(1).isRegMask()) + return false; + + unsigned Regs[2]; + unsigned FoundRegs = 0; + + auto RegMask = Prev->getOperand(1); + + auto &RegClass = + Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; + // Try to find up to NumPops free registers. + for (auto Candidate : RegClass) { + + // Poor man's liveness: + // Since we're immediately after a call, any register that is clobbered + // by the call and not defined by it can be considered dead. + if (!RegMask.clobbersPhysReg(Candidate)) + continue; + + bool IsDef = false; + for (const MachineOperand &MO : Prev->implicit_operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) { + IsDef = true; + break; + } + } + + if (IsDef) + continue; + + Regs[FoundRegs++] = Candidate; + if (FoundRegs == (unsigned)NumPops) + break; + } + + if (FoundRegs == 0) + return false; + + // If we found only one free register, but need two, reuse the same one twice. + while (FoundRegs < (unsigned)NumPops) + Regs[FoundRegs++] = Regs[0]; + + for (int i = 0; i < NumPops; ++i) + BuildMI(MBB, MBBI, DL, + TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]); + + return true; +} + +void X86FrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + bool reserveCallFrame = hasReservedCallFrame(MF); + unsigned Opcode = I->getOpcode(); + bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); + DebugLoc DL = I->getDebugLoc(); + uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; + uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; + I = MBB.erase(I); + + if (!reserveCallFrame) { + // If the stack pointer can be changed after prologue, turn the + // adjcallstackup instruction into a 'sub ESP, <amt>' and the + // adjcallstackdown instruction into 'add ESP, <amt>' + + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. 
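+    // For example, with a 16-byte stack alignment a 20-byte outgoing
+    // argument area is rounded up to 32 bytes, so the setup and the matching
+    // destroy adjust the stack by the same 32 bytes.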
+ unsigned StackAlign = getStackAlignment(); + Amount = RoundUpToAlignment(Amount, StackAlign); + + MachineModuleInfo &MMI = MF.getMMI(); + const Function *Fn = MF.getFunction(); + bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool DwarfCFI = !WindowsCFI && + (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + + // If we have any exception handlers in this function, and we adjust + // the SP before calls, we may need to indicate this to the unwinder + // using GNU_ARGS_SIZE. Note that this may be necessary even when + // Amount == 0, because the preceding function may have set a non-0 + // GNU_ARGS_SIZE. + // TODO: We don't need to reset this between subsequent functions, + // if it didn't change. + bool HasDwarfEHHandlers = !WindowsCFI && + !MF.getMMI().getLandingPads().empty(); + + if (HasDwarfEHHandlers && !isDestroy && + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences()) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createGnuArgsSize(nullptr, Amount)); + + if (Amount == 0) + return; + + // Factor out the amount that gets handled inside the sequence + // (Pushes of argument for frame setup, callee pops for frame destroy) + Amount -= InternalAmt; + + // TODO: This is needed only if we require precise CFA. + // If this is a callee-pop calling convention, emit a CFA adjust for + // the amount the callee popped. + if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); + + if (Amount) { + // Add Amount to SP to destroy a frame, and subtract to setup. + int Offset = isDestroy ? Amount : -Amount; + + if (!(Fn->optForMinSize() && + adjustStackWithPops(MBB, I, DL, Offset))) + BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); + } + + if (DwarfCFI && !hasFP(MF)) { + // If we don't have FP, but need to generate unwind information, + // we need to set the correct CFA offset after the stack adjustment. + // How much we adjust the CFA offset depends on whether we're emitting + // CFI only for EH purposes or for debugging. EH only requires the CFA + // offset to be correct at each call site, while for debugging we want + // it to be more precise. + int CFAOffset = Amount; + // TODO: When not using precise CFA, we also need to adjust for the + // InternalAmt here. + + if (CFAOffset) { + CFAOffset = isDestroy ? -CFAOffset : CFAOffset; + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + } + } + + return; + } + + if (isDestroy && InternalAmt) { + // If we are performing frame pointer elimination and if the callee pops + // something off the stack pointer, add it back. We do this until we have + // more advanced stack pointer tracking ability. + // We are not tracking the stack pointer adjustment by the callee, so make + // sure we restore the stack pointer immediately after the call, there may + // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. + MachineBasicBlock::iterator B = MBB.begin(); + while (I != B && !std::prev(I)->isCall()) + --I; + BuildStackAdjustment(MBB, I, DL, -InternalAmt, /*InEpilogue=*/false); + } +} + +bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + assert(MBB.getParent() && "Block is not attached to a function!"); + + // Win64 has strict requirements in terms of epilogue and we are + // not taking a chance at messing with them. + // I.e., unless this block is already an exit block, we can't use + // it as an epilogue. 
+ if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock()) + return false; + + if (canUseLEAForSPInEpilogue(*MBB.getParent())) + return true; + + // If we cannot use LEA to adjust SP, we may need to use ADD, which + // clobbers the EFLAGS. Check that we do not need to preserve it, + // otherwise, conservatively assume this is not + // safe to insert the epilogue here. + return !flagsNeedToBePreservedBeforeTheTerminators(MBB); +} + +bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + // If we may need to emit frameless compact unwind information, give + // up as this is currently broken: PR25614. + return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); +} + +MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool RestoreSP) const { + assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env"); + assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32"); + assert(STI.is32Bit() && !Uses64BitFramePtr && + "restoring EBP/ESI on non-32-bit target"); + + MachineFunction &MF = *MBB.getParent(); + unsigned FramePtr = TRI->getFrameRegister(MF); + unsigned BasePtr = TRI->getBaseRegister(); + WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // FIXME: Don't set FrameSetup flag in catchret case. + + int FI = FuncInfo.EHRegNodeFrameIndex; + int EHRegSize = MFI->getObjectSize(FI); + + if (RestoreSP) { + // MOV32rm -EHRegSize(%ebp), %esp + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP), + X86::EBP, true, -EHRegSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + unsigned UsedReg; + int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg); + int EndOffset = -EHRegOffset - EHRegSize; + FuncInfo.EHRegNodeEndOffset = EndOffset; + + if (UsedReg == FramePtr) { + // ADD $offset, %ebp + unsigned ADDri = getADDriOpcode(false, EndOffset); + BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr) + .addReg(FramePtr) + .addImm(EndOffset) + .setMIFlag(MachineInstr::FrameSetup) + ->getOperand(3) + .setIsDead(); + assert(EndOffset >= 0 && + "end of registration object above normal EBP position!"); + } else if (UsedReg == BasePtr) { + // LEA offset(%ebp), %esi + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr), + FramePtr, false, EndOffset) + .setMIFlag(MachineInstr::FrameSetup); + // MOV32rm SavedEBPOffset(%esi), %ebp + assert(X86FI->getHasSEHFramePtrSave()); + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr), + UsedReg, true, Offset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr"); + } + return MBBI; +} + +unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { + // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. + unsigned Offset = 16; + // RBP is immediately pushed. + Offset += SlotSize; + // All callee-saved registers are then pushed. + Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // Every funclet allocates enough stack space for the largest outgoing call. 
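+  // Worked example with illustrative sizes: an 8-byte RBP slot, 16 bytes of
+  // other callee saves and a 32-byte funclet frame (added below) give
+  // 16 + 8 + 16 + 32 = 72.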
+ Offset += getWinEHFuncletFrameSize(MF); + return Offset; +} + +void X86FrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + // If this function isn't doing Win64-style C++ EH, we don't need to do + // anything. + const Function *Fn = MF.getFunction(); + if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() || + classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX) + return; + + // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset + // relative to RSP after the prologue. Find the offset of the last fixed + // object, so that we can allocate a slot immediately following it. If there + // were no fixed objects, use offset -SlotSize, which is immediately after the + // return address. Fixed objects have negative frame indices. + MachineFrameInfo *MFI = MF.getFrameInfo(); + int64_t MinFixedObjOffset = -SlotSize; + for (int I = MFI->getObjectIndexBegin(); I < 0; ++I) + MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I)); + + int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; + int UnwindHelpFI = + MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); + MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI; + + // Store -2 into UnwindHelp on function entry. We have to scan forwards past + // other frame setup instructions. + MachineBasicBlock &MBB = MF.front(); + auto MBBI = MBB.begin(); + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + DebugLoc DL = MBB.findDebugLoc(MBBI); + addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)), + UnwindHelpFI) + .addImm(-2); +} diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h new file mode 100644 index 0000000..3ab41b4 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -0,0 +1,203 @@ +//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class implements X86-specific bits of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H +#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +class MachineInstrBuilder; +class MCCFIInstruction; +class X86Subtarget; +class X86RegisterInfo; + +class X86FrameLowering : public TargetFrameLowering { +public: + X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride); + + // Cached subtarget predicates. + + const X86Subtarget &STI; + const TargetInstrInfo &TII; + const X86RegisterInfo *TRI; + + unsigned SlotSize; + + /// Is64Bit implies that x86_64 instructions are available. + bool Is64Bit; + + bool IsLP64; + + /// True if the 64-bit frame or stack pointer should be used. True for most + /// 64-bit targets with the exception of x32. If this is false, 32-bit + /// instruction operands should be used to manipulate StackPtr and FramePtr. + bool Uses64BitFramePtr; + + unsigned StackPtr; + + /// Emit target stack probe code. This is required for all + /// large stack allocations on Windows. The caller is required to materialize + /// the number of bytes to probe in RAX/EAX. 
Returns instruction just + /// after the expansion. + MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool InProlog) const; + + /// Replace a StackProbe inline-stub with the actual probe code inline. + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const override; + + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + void adjustForSegmentedStacks(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + + void adjustForHiPEPrologue(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const override; + + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const override; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + bool hasFP(const MachineFunction &MF) const override; + bool hasReservedCallFrame(const MachineFunction &MF) const override; + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; + bool needsFrameIndexResolution(const MachineFunction &MF) const override; + + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + + int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const override; + + /// Check the instruction before/after the passed instruction. If + /// it is an ADD/SUB/LEA instruction it is deleted argument and the + /// stack adjustment is returned as a positive value for ADD/LEA and + /// a negative for SUB. + int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + bool doMergeWithPrevious) const; + + /// Emit a series of instructions to increment / decrement the stack + /// pointer by a constant value. + void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + int64_t NumBytes, bool InEpilogue) const; + + /// Check that LEA can be used on SP in an epilogue sequence for \p MF. + bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const; + + /// Check whether or not the given \p MBB can be used as a epilogue + /// for the target. + /// The epilogue will be inserted before the first terminator of that block. + /// This method is used by the shrink-wrapping pass to decide if + /// \p MBB will be correctly handled by the target. 
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; + + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override; + + /// convertArgMovsToPushes - This method tries to convert a call sequence + /// that uses sub and mov instructions to put the argument onto the stack + /// into a series of pushes. + /// Returns true if the transformation succeeded, false if not. + bool convertArgMovsToPushes(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + uint64_t Amount) const; + + /// Wraps up getting a CFI index and building a MachineInstr for it. + void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, MCCFIInstruction CFIInst) const; + + /// Sets up EBP and optionally ESI based on the incoming EBP value. Only + /// needed for 32-bit. Used in funclet prologues and at catchret destinations. + MachineBasicBlock::iterator + restoreWin32EHStackPointers(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool RestoreSP = false) const; + +private: + uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; + + /// Emit target stack probe as a call to a helper function + MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit target stack probe as an inline sequence. + MachineInstr *emitStackProbeInline(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit a stub to later inline the target stack probe. + MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Aligns the stack pointer by ANDing it with -MaxAlign. + void BuildStackAlignAND(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + unsigned Reg, uint64_t MaxAlign) const; + + /// Make small positive stack adjustments using POPs. + bool adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + int Offset) const; + + /// Adjusts the stack pointer using LEA, SUB, or ADD. + MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, int64_t Offset, + bool InEpilogue) const; + + unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const; + + unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp new file mode 100644 index 0000000..868ae4e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -0,0 +1,3012 @@ +//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a DAG pattern matching instruction selector for X86, +// converting from a legalized dag to a X86 dag. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include <stdint.h> +using namespace llvm; + +#define DEBUG_TYPE "x86-isel" + +STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); + +//===----------------------------------------------------------------------===// +// Pattern Matcher Implementation +//===----------------------------------------------------------------------===// + +namespace { + /// This corresponds to X86AddressMode, but uses SDValue's instead of register + /// numbers for the leaves of the matched tree. + struct X86ISelAddressMode { + enum { + RegBase, + FrameIndexBase + } BaseType; + + // This is really a union, discriminated by BaseType! + SDValue Base_Reg; + int Base_FrameIndex; + + unsigned Scale; + SDValue IndexReg; + int32_t Disp; + SDValue Segment; + const GlobalValue *GV; + const Constant *CP; + const BlockAddress *BlockAddr; + const char *ES; + MCSymbol *MCSym; + int JT; + unsigned Align; // CP alignment. + unsigned char SymbolFlags; // X86II::MO_* + + X86ISelAddressMode() + : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), + Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), + MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {} + + bool hasSymbolicDisplacement() const { + return GV != nullptr || CP != nullptr || ES != nullptr || + MCSym != nullptr || JT != -1 || BlockAddr != nullptr; + } + + bool hasBaseOrIndexReg() const { + return BaseType == FrameIndexBase || + IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; + } + + /// Return true if this addressing mode is already RIP-relative. 
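+    /// Such a mode can carry at most a 32-bit displacement on top of %rip
+    /// and no base or index register, e.g. 'sym(%rip)' is representable but
+    /// 'sym(%rip,%rcx,4)' is not.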
+ bool isRIPRelative() const { + if (BaseType != RegBase) return false; + if (RegisterSDNode *RegNode = + dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode())) + return RegNode->getReg() == X86::RIP; + return false; + } + + void setBaseReg(SDValue Reg) { + BaseType = RegBase; + Base_Reg = Reg; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() { + dbgs() << "X86ISelAddressMode " << this << '\n'; + dbgs() << "Base_Reg "; + if (Base_Reg.getNode()) + Base_Reg.getNode()->dump(); + else + dbgs() << "nul"; + dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n' + << " Scale" << Scale << '\n' + << "IndexReg "; + if (IndexReg.getNode()) + IndexReg.getNode()->dump(); + else + dbgs() << "nul"; + dbgs() << " Disp " << Disp << '\n' + << "GV "; + if (GV) + GV->dump(); + else + dbgs() << "nul"; + dbgs() << " CP "; + if (CP) + CP->dump(); + else + dbgs() << "nul"; + dbgs() << '\n' + << "ES "; + if (ES) + dbgs() << ES; + else + dbgs() << "nul"; + dbgs() << " MCSym "; + if (MCSym) + dbgs() << MCSym; + else + dbgs() << "nul"; + dbgs() << " JT" << JT << " Align" << Align << '\n'; + } +#endif + }; +} + +namespace { + //===--------------------------------------------------------------------===// + /// ISel - X86-specific code to select X86 machine instructions for + /// SelectionDAG operations. + /// + class X86DAGToDAGISel final : public SelectionDAGISel { + /// Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// If true, selector should try to optimize for code size instead of + /// performance. + bool OptForSize; + + public: + explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), OptForSize(false) {} + + const char *getPassName() const override { + return "X86 DAG->DAG Instruction Selection"; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + // Reset the subtarget each time through. + Subtarget = &MF.getSubtarget<X86Subtarget>(); + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } + + void EmitFunctionEntryCode() override; + + bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; + + void PreprocessISelDAG() override; + + inline bool immSext8(SDNode *N) const { + return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); + } + + // True if the 64-bit immediate fits in a 32-bit sign-extended field. + inline bool i64immSExt32(SDNode *N) const { + uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); + return (int64_t)v == (int32_t)v; + } + +// Include the pieces autogenerated from the target description. 
+#include "X86GenDAGISel.inc" + + private: + SDNode *Select(SDNode *N) override; + SDNode *selectGather(SDNode *N, unsigned Opc); + SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT); + + bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); + bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); + bool matchWrapper(SDValue N, X86ISelAddressMode &AM); + bool matchAddress(SDValue N, X86ISelAddressMode &AM); + bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); + bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + unsigned Depth); + bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); + bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectMOV64Imm32(SDValue N, SDValue &Imm); + bool selectLEAAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectLEA64_32Addr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectTLSADDRAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectScalarSSELoad(SDNode *Root, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment, + SDValue &NodeWithChain); + + bool tryFoldLoad(SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + + /// Implement addressing mode selection for inline asm expressions. + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + unsigned ConstraintID, + std::vector<SDValue> &OutOps) override; + + void emitSpecialCodeForMain(); + + inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) + ? CurDAG->getTargetFrameIndex( + AM.Base_FrameIndex, + TLI->getPointerTy(CurDAG->getDataLayout())) + : AM.Base_Reg; + Scale = getI8Imm(AM.Scale, DL); + Index = AM.IndexReg; + // These are 32-bit even in 64-bit mode since RIP-relative offset + // is 32-bit. + if (AM.GV) + Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), + MVT::i32, AM.Disp, + AM.SymbolFlags); + else if (AM.CP) + Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, + AM.Align, AM.Disp, AM.SymbolFlags); + else if (AM.ES) { + assert(!AM.Disp && "Non-zero displacement is ignored with ES."); + Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); + } else if (AM.MCSym) { + assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); + assert(AM.SymbolFlags == 0 && "oo"); + Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); + } else if (AM.JT != -1) { + assert(!AM.Disp && "Non-zero displacement is ignored with JT."); + Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); + } else if (AM.BlockAddr) + Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, + AM.SymbolFlags); + else + Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); + + if (AM.Segment.getNode()) + Segment = AM.Segment; + else + Segment = CurDAG->getRegister(0, MVT::i32); + } + + // Utility function to determine whether we should avoid selecting + // immediate forms of instructions for better code size or not. 
+ // At a high level, we'd like to avoid such instructions when + // we have similar constants used within the same basic block + // that can be kept in a register. + // + bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { + uint32_t UseCount = 0; + + // Do not want to hoist if we're not optimizing for size. + // TODO: We'd like to remove this restriction. + // See the comment in X86InstrInfo.td for more info. + if (!OptForSize) + return false; + + // Walk all the users of the immediate. + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { + + SDNode *User = *UI; + + // This user is already selected. Count it as a legitimate use and + // move on. + if (User->isMachineOpcode()) { + UseCount++; + continue; + } + + // We want to count stores of immediates as real uses. + if (User->getOpcode() == ISD::STORE && + User->getOperand(1).getNode() == N) { + UseCount++; + continue; + } + + // We don't currently match users that have > 2 operands (except + // for stores, which are handled above) + // Those instruction won't match in ISEL, for now, and would + // be counted incorrectly. + // This may change in the future as we add additional instruction + // types. + if (User->getNumOperands() != 2) + continue; + + // Immediates that are used for offsets as part of stack + // manipulation should be left alone. These are typically + // used to indicate SP offsets for argument passing and + // will get pulled into stores/pushes (implicitly). + if (User->getOpcode() == X86ISD::ADD || + User->getOpcode() == ISD::ADD || + User->getOpcode() == X86ISD::SUB || + User->getOpcode() == ISD::SUB) { + + // Find the other operand of the add/sub. + SDValue OtherOp = User->getOperand(0); + if (OtherOp.getNode() == N) + OtherOp = User->getOperand(1); + + // Don't count if the other operand is SP. + RegisterSDNode *RegNode; + if (OtherOp->getOpcode() == ISD::CopyFromReg && + (RegNode = dyn_cast_or_null<RegisterSDNode>( + OtherOp->getOperand(1).getNode()))) + if ((RegNode->getReg() == X86::ESP) || + (RegNode->getReg() == X86::RSP)) + continue; + } + + // ... otherwise, count this and move on. + UseCount++; + } + + // If we have more than 1 use, then recommend for hoisting. + return (UseCount > 1); + } + + /// Return a target constant with the specified value of type i8. + inline SDValue getI8Imm(unsigned Imm, SDLoc DL) { + return CurDAG->getTargetConstant(Imm, DL, MVT::i8); + } + + /// Return a target constant with the specified value, of type i32. + inline SDValue getI32Imm(unsigned Imm, SDLoc DL) { + return CurDAG->getTargetConstant(Imm, DL, MVT::i32); + } + + /// Return an SDNode that returns the value of the global base register. + /// Output instructions required to initialize the global base register, + /// if necessary. + SDNode *getGlobalBaseReg(); + + /// Return a reference to the TargetMachine, casted to the target-specific + /// type. + const X86TargetMachine &getTargetMachine() const { + return static_cast<const X86TargetMachine &>(TM); + } + + /// Return a reference to the TargetInstrInfo, casted to the target-specific + /// type. + const X86InstrInfo *getInstrInfo() const { + return Subtarget->getInstrInfo(); + } + + /// \brief Address-mode matching performs shift-of-and to and-of-shift + /// reassociation in order to expose more scaled addressing + /// opportunities. 
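+    /// For example, (x << 2) & 0xFC is reassociated to (x & 0x3F) << 2 so
+    /// that the shift can be folded into the address as a scale of 4.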
+ bool ComplexPatternFuncMutatesDAG() const override { + return true; + } + }; +} + + +bool +X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { + if (OptLevel == CodeGenOpt::None) return false; + + if (!N.hasOneUse()) + return false; + + if (N.getOpcode() != ISD::LOAD) + return true; + + // If N is a load, do additional profitability checks. + if (U == Root) { + switch (U->getOpcode()) { + default: break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::AND: + case X86ISD::XOR: + case X86ISD::OR: + case ISD::ADD: + case ISD::ADDC: + case ISD::ADDE: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + SDValue Op1 = U->getOperand(1); + + // If the other operand is a 8-bit immediate we should fold the immediate + // instead. This reduces code size. + // e.g. + // movl 4(%esp), %eax + // addl $4, %eax + // vs. + // movl $4, %eax + // addl 4(%esp), %eax + // The former is 2 bytes shorter. In case where the increment is 1, then + // the saving can be 4 bytes (by using incl %eax). + if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) + if (Imm->getAPIntValue().isSignedIntN(8)) + return false; + + // If the other operand is a TLS address, we should fold it instead. + // This produces + // movl %gs:0, %eax + // leal i@NTPOFF(%eax), %eax + // instead of + // movl $i@NTPOFF, %eax + // addl %gs:0, %eax + // if the block also has an access to a second TLS address this will save + // a load. + // FIXME: This is probably also true for non-TLS addresses. + if (Op1.getOpcode() == X86ISD::Wrapper) { + SDValue Val = Op1.getOperand(0); + if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) + return false; + } + } + } + } + + return true; +} + +/// Replace the original chain operand of the call with +/// load's chain operand and move load below the call's chain operand. +static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, + SDValue Call, SDValue OrigChain) { + SmallVector<SDValue, 8> Ops; + SDValue Chain = OrigChain.getOperand(0); + if (Chain.getNode() == Load.getNode()) + Ops.push_back(Load.getOperand(0)); + else { + assert(Chain.getOpcode() == ISD::TokenFactor && + "Unexpected chain operand"); + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) + if (Chain.getOperand(i).getNode() == Load.getNode()) + Ops.push_back(Load.getOperand(0)); + else + Ops.push_back(Chain.getOperand(i)); + SDValue NewChain = + CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); + Ops.clear(); + Ops.push_back(NewChain); + } + Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); + CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); + CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), + Load.getOperand(1), Load.getOperand(2)); + + Ops.clear(); + Ops.push_back(SDValue(Load.getNode(), 1)); + Ops.append(Call->op_begin() + 1, Call->op_end()); + CurDAG->UpdateNodeOperands(Call.getNode(), Ops); +} + +/// Return true if call address is a load and it can be +/// moved below CALLSEQ_START and the chains leading up to the call. +/// Return the CALLSEQ_START by reference as a second output. +/// In the case of a tail call, there isn't a callseq node between the call +/// chain and the load. +static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { + // The transformation is somewhat dangerous if the call's chain was glued to + // the call. After MoveBelowOrigChain the load is moved between the call and + // the chain, this can create a cycle if the load is not folded. 
So it is + // *really* important that we are sure the load will be folded. + if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) + return false; + LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode()); + if (!LD || + LD->isVolatile() || + LD->getAddressingMode() != ISD::UNINDEXED || + LD->getExtensionType() != ISD::NON_EXTLOAD) + return false; + + // Now let's find the callseq_start. + while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { + if (!Chain.hasOneUse()) + return false; + Chain = Chain.getOperand(0); + } + + if (!Chain.getNumOperands()) + return false; + // Since we are not checking for AA here, conservatively abort if the chain + // writes to memory. It's not safe to move the callee (a load) across a store. + if (isa<MemSDNode>(Chain.getNode()) && + cast<MemSDNode>(Chain.getNode())->writeMem()) + return false; + if (Chain.getOperand(0).getNode() == Callee.getNode()) + return true; + if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && + Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) && + Callee.getValue(1).hasOneUse()) + return true; + return false; +} + +void X86DAGToDAGISel::PreprocessISelDAG() { + // OptForSize is used in pattern predicates that isel is matching. + OptForSize = MF->getFunction()->optForSize(); + + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), + E = CurDAG->allnodes_end(); I != E; ) { + SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. + + if (OptLevel != CodeGenOpt::None && + // Only does this when target favors doesn't favor register indirect + // call. + ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) || + (N->getOpcode() == X86ISD::TC_RETURN && + // Only does this if load can be folded into TC_RETURN. + (Subtarget->is64Bit() || + getTargetMachine().getRelocationModel() != Reloc::PIC_)))) { + /// Also try moving call address load from outside callseq_start to just + /// before the call to allow it to be folded. + /// + /// [Load chain] + /// ^ + /// | + /// [Load] + /// ^ ^ + /// | | + /// / \-- + /// / | + ///[CALLSEQ_START] | + /// ^ | + /// | | + /// [LOAD/C2Reg] | + /// | | + /// \ / + /// \ / + /// [CALL] + bool HasCallSeq = N->getOpcode() == X86ISD::CALL; + SDValue Chain = N->getOperand(0); + SDValue Load = N->getOperand(1); + if (!isCalleeLoad(Load, Chain, HasCallSeq)) + continue; + moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); + ++NumLoadMoved; + continue; + } + + // Lower fpround and fpextend nodes that target the FP stack to be store and + // load to the stack. This is a gross hack. We would like to simply mark + // these as being illegal, but when we do that, legalize produces these when + // it expands calls, then expands these in the same legalize pass. We would + // like dag combine to be able to hack on these between the call expansion + // and the node legalization. As such this pass basically does "really + // late" legalization of these inline with the X86 isel pass. + // FIXME: This should only happen when not compiled with -O0. + if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) + continue; + + MVT SrcVT = N->getOperand(0).getSimpleValueType(); + MVT DstVT = N->getSimpleValueType(0); + + // If any of the sources are vectors, no fp stack involved. + if (SrcVT.isVector() || DstVT.isVector()) + continue; + + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. 
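+    // Otherwise the conversion is routed through memory below: for example,
+    // an f80 -> f32 FP_ROUND whose result is wanted in an SSE register
+    // becomes a truncating store of the x87 value to a stack temporary
+    // followed by an ordinary f32 load of that slot.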
+ const X86TargetLowering *X86Lowering = + static_cast<const X86TargetLowering *>(TLI); + bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); + if (SrcIsSSE && DstIsSSE) + continue; + + if (!SrcIsSSE && !DstIsSSE) { + // If this is an FPStack extension, it is a noop. + if (N->getOpcode() == ISD::FP_EXTEND) + continue; + // If this is a value-preserving FPStack truncation, it is a noop. + if (N->getConstantOperandVal(1)) + continue; + } + + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. + // FPStack has extload and truncstore. SSE can fold direct loads into other + // operations. Based on this, decide what we want to do. + MVT MemVT; + if (N->getOpcode() == ISD::FP_ROUND) + MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. + else + MemVT = SrcIsSSE ? SrcVT : DstVT; + + SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + SDLoc dl(N); + + // FIXME: optimize the case where the src/dest is a load or store? + SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, + N->getOperand(0), + MemTmp, MachinePointerInfo(), MemVT, + false, false, 0); + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, + MachinePointerInfo(), + MemVT, false, false, false, 0); + + // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the + // extload we created. This will cause general havok on the dag because + // anything below the conversion could be folded into other existing nodes. + // To avoid invalidating 'I', back it up to the convert node. + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + + // Now that we did that, the node is dead. Increment the iterator to the + // next node to process, then delete N. + ++I; + CurDAG->DeleteNode(N); + } +} + + +/// Emit any code that needs to be executed only in the main function. +void X86DAGToDAGISel::emitSpecialCodeForMain() { + if (Subtarget->isTargetCygMing()) { + TargetLowering::ArgListTy Args; + auto &DL = CurDAG->getDataLayout(); + + TargetLowering::CallLoweringInfo CLI(*CurDAG); + CLI.setChain(CurDAG->getRoot()) + .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), + CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), + std::move(Args), 0); + const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); + std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); + CurDAG->setRoot(Result.second); + } +} + +void X86DAGToDAGISel::EmitFunctionEntryCode() { + // If this is main, emit special code for main. + if (const Function *Fn = MF->getFunction()) + if (Fn->hasExternalLinkage() && Fn->getName() == "main") + emitSpecialCodeForMain(); +} + +static bool isDispSafeForFrameIndex(int64_t Val) { + // On 64-bit platforms, we can run into an issue where a frame index + // includes a displacement that, when added to the explicit displacement, + // will overflow the displacement field. Assuming that the frame index + // displacement fits into a 31-bit integer (which is only slightly more + // aggressive than the current fundamental assumption that it fits into + // a 32-bit integer), a 31-bit disp should always be safe. + return isInt<31>(Val); +} + +bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, + X86ISelAddressMode &AM) { + // Cannot combine ExternalSymbol displacements with integer offsets. 
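+  // (getAddressOperands asserts that ES/MCSym operands carry a zero
+  //  displacement, so e.g. 'extsym + 16' cannot be folded into one operand.)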
+ if (Offset != 0 && (AM.ES || AM.MCSym)) + return true; + int64_t Val = AM.Disp + Offset; + CodeModel::Model M = TM.getCodeModel(); + if (Subtarget->is64Bit()) { + if (!X86::isOffsetSuitableForCodeModel(Val, M, + AM.hasSymbolicDisplacement())) + return true; + // In addition to the checks required for a register base, check that + // we do not try to use an unsafe Disp with a frame index. + if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && + !isDispSafeForFrameIndex(Val)) + return true; + } + AM.Disp = Val; + return false; + +} + +bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ + SDValue Address = N->getOperand(1); + + // load gs:0 -> GS segment register. + // load fs:0 -> FS segment register. + // + // This optimization is valid because the GNU TLS model defines that + // gs:0 (or fs:0 on X86-64) contains its own address. + // For more information see http://people.redhat.com/drepper/tls.pdf + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) + if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && + Subtarget->isTargetLinux()) + switch (N->getPointerInfo().getAddrSpace()) { + case 256: + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + return false; + case 257: + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + return false; + } + + return true; +} + +/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing +/// mode. These wrap things that will resolve down into a symbol reference. +/// If no match is possible, this returns true, otherwise it returns false. +bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { + // If the addressing mode already has a symbol as the displacement, we can + // never match another symbol. + if (AM.hasSymbolicDisplacement()) + return true; + + SDValue N0 = N.getOperand(0); + CodeModel::Model M = TM.getCodeModel(); + + // Handle X86-64 rip-relative addresses. We check this before checking direct + // folding because RIP is preferable to non-RIP accesses. + if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP && + // Under X86-64 non-small code model, GV (and friends) are 64-bits, so + // they cannot be folded into immediate fields. + // FIXME: This can be improved for kernel and other models? + (M == CodeModel::Small || M == CodeModel::Kernel)) { + // Base and index reg must be 0 in order to use %rip as base. 
+ if (AM.hasBaseOrIndexReg()) + return true; + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.GV = G->getGlobal(); + AM.SymbolFlags = G->getTargetFlags(); + if (foldOffsetIntoAddress(G->getOffset(), AM)) { + AM = Backup; + return true; + } + } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.CP = CP->getConstVal(); + AM.Align = CP->getAlignment(); + AM.SymbolFlags = CP->getTargetFlags(); + if (foldOffsetIntoAddress(CP->getOffset(), AM)) { + AM = Backup; + return true; + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { + AM.ES = S->getSymbol(); + AM.SymbolFlags = S->getTargetFlags(); + } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { + AM.MCSym = S->getMCSymbol(); + } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { + AM.JT = J->getIndex(); + AM.SymbolFlags = J->getTargetFlags(); + } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.BlockAddr = BA->getBlockAddress(); + AM.SymbolFlags = BA->getTargetFlags(); + if (foldOffsetIntoAddress(BA->getOffset(), AM)) { + AM = Backup; + return true; + } + } else + llvm_unreachable("Unhandled symbol reference node."); + + if (N.getOpcode() == X86ISD::WrapperRIP) + AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); + return false; + } + + // Handle the case when globals fit in our immediate field: This is true for + // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit + // mode, this only applies to a non-RIP-relative computation. + if (!Subtarget->is64Bit() || + M == CodeModel::Small || M == CodeModel::Kernel) { + assert(N.getOpcode() != X86ISD::WrapperRIP && + "RIP-relative addressing already handled"); + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { + AM.GV = G->getGlobal(); + AM.Disp += G->getOffset(); + AM.SymbolFlags = G->getTargetFlags(); + } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { + AM.CP = CP->getConstVal(); + AM.Align = CP->getAlignment(); + AM.Disp += CP->getOffset(); + AM.SymbolFlags = CP->getTargetFlags(); + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { + AM.ES = S->getSymbol(); + AM.SymbolFlags = S->getTargetFlags(); + } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { + AM.MCSym = S->getMCSymbol(); + } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { + AM.JT = J->getIndex(); + AM.SymbolFlags = J->getTargetFlags(); + } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { + AM.BlockAddr = BA->getBlockAddress(); + AM.Disp += BA->getOffset(); + AM.SymbolFlags = BA->getTargetFlags(); + } else + llvm_unreachable("Unhandled symbol reference node."); + return false; + } + + return true; +} + +/// Add the specified node to the specified addressing mode, returning true if +/// it cannot be done. This just pattern matches for the addressing mode. +bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { + if (matchAddressRecursively(N, AM, 0)) + return true; + + // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has + // a smaller encoding and avoids a scaled-index. + if (AM.Scale == 2 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr) { + AM.Base_Reg = AM.IndexReg; + AM.Scale = 1; + } + + // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, + // because it has a smaller encoding. 
+ // TODO: Which other code models can use this? + if (TM.getCodeModel() == CodeModel::Small && + Subtarget->is64Bit() && + AM.Scale == 1 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr && + AM.SymbolFlags == X86II::MO_NO_FLAG && + AM.hasSymbolicDisplacement()) + AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); + + return false; +} + +bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + X86ISelAddressMode Backup = AM; + if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) + return false; + AM = Backup; + + // Try again after commuting the operands. + if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + return false; + AM = Backup; + + // If we couldn't fold both operands into the address at the same time, + // see if we can just put each operand into a register and fold at least + // the add. + if (AM.BaseType == X86ISelAddressMode::RegBase && + !AM.Base_Reg.getNode() && + !AM.IndexReg.getNode()) { + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); + AM.Scale = 1; + return false; + } + N = Handle.getValue(); + return true; +} + +// Insert a node into the DAG at least before the Pos node's position. This +// will reposition the node as needed, and will assign it a node ID that is <= +// the Pos node's ID. Note that this does *not* preserve the uniqueness of node +// IDs! The selection DAG must no longer depend on their uniqueness when this +// is used. +static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { + if (N.getNode()->getNodeId() == -1 || + N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { + DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode()); + N.getNode()->setNodeId(Pos.getNode()->getNodeId()); + } +} + +// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if +// safe. This allows us to convert the shift and and into an h-register +// extract and a scaled index. Returns false if the simplification is +// performed. +static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || + !isa<ConstantSDNode>(Shift.getOperand(1)) || + !Shift.hasOneUse()) + return true; + + int ScaleLog = 8 - Shift.getConstantOperandVal(1); + if (ScaleLog <= 0 || ScaleLog >= 4 || + Mask != (0xffu << ScaleLog)) + return true; + + MVT VT = N.getSimpleValueType(); + SDLoc DL(N); + SDValue Eight = DAG.getConstant(8, DL, MVT::i8); + SDValue NewMask = DAG.getConstant(0xff, DL, VT); + SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); + SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. 
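+  // For instance, with ScaleLog == 2 the incoming (x >> 6) & 0x3fc has been
+  // rebuilt above as ((x >> 8) & 0xff) << 2, which exposes an h-register
+  // style byte extract plus a scale-4 index.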
+ insertDAGNode(DAG, N, Eight); + insertDAGNode(DAG, N, Srl); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, And); + insertDAGNode(DAG, N, ShlCount); + insertDAGNode(DAG, N, Shl); + DAG.ReplaceAllUsesWith(N, Shl); + AM.IndexReg = And; + AM.Scale = (1 << ScaleLog); + return false; +} + +// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this +// allows us to fold the shift into this addressing mode. Returns false if the +// transform succeeded. +static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SHL || + !isa<ConstantSDNode>(Shift.getOperand(1))) + return true; + + // Not likely to be profitable if either the AND or SHIFT node has more + // than one use (unless all uses are for address computation). Besides, + // isel mechanism requires their node ids to be reused. + if (!N.hasOneUse() || !Shift.hasOneUse()) + return true; + + // Verify that the shift amount is something we can fold. + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) + return true; + + MVT VT = N.getSimpleValueType(); + SDLoc DL(N); + SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); + SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, NewAnd); + insertDAGNode(DAG, N, NewShift); + DAG.ReplaceAllUsesWith(N, NewShift); + + AM.Scale = 1 << ShiftAmt; + AM.IndexReg = NewAnd; + return false; +} + +// Implement some heroics to detect shifts of masked values where the mask can +// be replaced by extending the shift and undoing that in the addressing mode +// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and +// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in +// the addressing mode. This results in code such as: +// +// int f(short *y, int *lookup_table) { +// ... +// return *y + lookup_table[*y >> 11]; +// } +// +// Turning into: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $11, %ecx +// addl (%rsi,%rcx,4), %eax +// +// Instead of: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $9, %ecx +// andl $124, %rcx +// addl (%rsi,%rcx), %eax +// +// Note that this function assumes the mask is provided as a mask *after* the +// value is shifted. The input chain may or may not match that, but computing +// such a mask is trivial. +static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || + !isa<ConstantSDNode>(Shift.getOperand(1))) + return true; + + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + unsigned MaskLZ = countLeadingZeros(Mask); + unsigned MaskTZ = countTrailingZeros(Mask); + + // The amount of shift we're trying to fit into the addressing mode is taken + // from the trailing zeros of the mask. + unsigned AMShiftAmt = MaskTZ; + + // There is nothing we can do here unless the mask is removing some bits. 
+ // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. + if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + + // We also need to ensure that mask is a continuous run of bits. + if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; + + // Scale the leading zero count down based on the actual size of the value. + // Also scale it down based on the size of the shift. + MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; + + // The final check is to ensure that any masked out high bits of X are + // already known to be zero. Otherwise, the mask has a semantic impact + // other than masking out a couple of low bits. Unfortunately, because of + // the mask, zero extensions will be removed from operands in some cases. + // This code works extra hard to look through extensions because we can + // replace them with zero extensions cheaply if necessary. + bool ReplacingAnyExtend = false; + if (X.getOpcode() == ISD::ANY_EXTEND) { + unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - + X.getOperand(0).getSimpleValueType().getSizeInBits(); + // Assume that we'll replace the any-extend with a zero-extend, and + // narrow the search to the extended value. + X = X.getOperand(0); + MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; + ReplacingAnyExtend = true; + } + APInt MaskedHighBits = + APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); + APInt KnownZero, KnownOne; + DAG.computeKnownBits(X, KnownZero, KnownOne); + if (MaskedHighBits != KnownZero) return true; + + // We've identified a pattern that can be transformed into a single shift + // and an addressing mode. Make it so. + MVT VT = N.getSimpleValueType(); + if (ReplacingAnyExtend) { + assert(X.getValueType() != VT); + // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. + SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); + insertDAGNode(DAG, N, NewX); + X = NewX; + } + SDLoc DL(N); + SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); + SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); + SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); + SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + insertDAGNode(DAG, N, NewSRLAmt); + insertDAGNode(DAG, N, NewSRL); + insertDAGNode(DAG, N, NewSHLAmt); + insertDAGNode(DAG, N, NewSHL); + DAG.ReplaceAllUsesWith(N, NewSHL); + + AM.Scale = 1 << AMShiftAmt; + AM.IndexReg = NewSRL; + return false; +} + +bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + SDLoc dl(N); + DEBUG({ + dbgs() << "MatchAddress: "; + AM.dump(); + }); + // Limit recursion. + if (Depth > 5) + return matchAddressBase(N, AM); + + // If this is already a %rip relative address, we can only merge immediates + // into it. Instead of handling this in every case, we handle it here. + // RIP relative addressing: %rip + 32-bit displacement! + if (AM.isRIPRelative()) { + // FIXME: JumpTable and ExternalSymbol address currently don't like + // displacements. It isn't very important, but this should be fixed for + // consistency. 
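+    // In a RIP-relative address only the 32-bit displacement is still free,
+    // so reject jump-table addresses here and fold at most a plain constant
+    // offset below.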
+ if (!(AM.ES || AM.MCSym) && AM.JT != -1) + return true; + + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) + if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) + return false; + return true; + } + + switch (N.getOpcode()) { + default: break; + case ISD::LOCAL_RECOVER: { + if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) + if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { + // Use the symbol and don't prefix it. + AM.MCSym = ESNode->getMCSymbol(); + return false; + } + break; + } + case ISD::Constant: { + uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); + if (!foldOffsetIntoAddress(Val, AM)) + return false; + break; + } + + case X86ISD::Wrapper: + case X86ISD::WrapperRIP: + if (!matchWrapper(N, AM)) + return false; + break; + + case ISD::LOAD: + if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) + return false; + break; + + case ISD::FrameIndex: + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr && + (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { + AM.BaseType = X86ISelAddressMode::FrameIndexBase; + AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); + return false; + } + break; + + case ISD::SHL: + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) + break; + + if (ConstantSDNode + *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) { + unsigned Val = CN->getZExtValue(); + // Note that we handle x<<1 as (,x,2) rather than (x,x) here so + // that the base operand remains free for further matching. If + // the base doesn't end up getting used, a post-processing step + // in MatchAddress turns (,x,2) into (x,x), which is cheaper. + if (Val == 1 || Val == 2 || Val == 3) { + AM.Scale = 1 << Val; + SDValue ShVal = N.getNode()->getOperand(0); + + // Okay, we know that we have a scale by now. However, if the scaled + // value is an add of something and a constant, we can fold the + // constant into the disp field here. + if (CurDAG->isBaseWithConstantOffset(ShVal)) { + AM.IndexReg = ShVal.getNode()->getOperand(0); + ConstantSDNode *AddVal = + cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); + uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; + if (!foldOffsetIntoAddress(Disp, AM)) + return false; + } + + AM.IndexReg = ShVal; + return false; + } + } + break; + + case ISD::SRL: { + // Scale must not be used already. + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; + + SDValue And = N.getOperand(0); + if (And.getOpcode() != ISD::AND) break; + SDValue X = And.getOperand(0); + + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getSimpleValueType().getSizeInBits() > 64) break; + + // The mask used for the transform is expected to be post-shift, but we + // found the shift first so just apply the shift to the mask before passing + // it down. + if (!isa<ConstantSDNode>(N.getOperand(1)) || + !isa<ConstantSDNode>(And.getOperand(1))) + break; + uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); + + // Try to fold the mask and shift into the scale, and return false if we + // succeed. + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) + return false; + break; + } + + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + // A mul_lohi where we need the low part can be folded as a plain multiply. 
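+    // Result 0 of [SU]MUL_LOHI is the low half, which is all an address
+    // computation needs, so it can share the MUL handling below, where e.g.
+    // x*5 becomes base + index*4 (as in leal (%reg,%reg,4), %reg).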
+ if (N.getResNo() != 0) break; + // FALL THROUGH + case ISD::MUL: + case X86ISD::MUL_IMM: + // X*[3,5,9] -> X+X*[2,4,8] + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr) { + if (ConstantSDNode + *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) + if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || + CN->getZExtValue() == 9) { + AM.Scale = unsigned(CN->getZExtValue())-1; + + SDValue MulVal = N.getNode()->getOperand(0); + SDValue Reg; + + // Okay, we know that we have a scale by now. However, if the scaled + // value is an add of something and a constant, we can fold the + // constant into the disp field here. + if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && + isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) { + Reg = MulVal.getNode()->getOperand(0); + ConstantSDNode *AddVal = + cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); + uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); + if (foldOffsetIntoAddress(Disp, AM)) + Reg = N.getNode()->getOperand(0); + } else { + Reg = N.getNode()->getOperand(0); + } + + AM.IndexReg = AM.Base_Reg = Reg; + return false; + } + } + break; + + case ISD::SUB: { + // Given A-B, if A can be completely folded into the address and + // the index field with the index field unused, use -B as the index. + // This is a win if a has multiple parts that can be folded into + // the address. Also, this saves a mov if the base register has + // other uses, since it avoids a two-address sub instruction, however + // it costs an additional mov if the index register has other uses. + + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + // Test if the LHS of the sub can be folded. + X86ISelAddressMode Backup = AM; + if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + AM = Backup; + break; + } + // Test if the index field is free for use. + if (AM.IndexReg.getNode() || AM.isRIPRelative()) { + AM = Backup; + break; + } + + int Cost = 0; + SDValue RHS = Handle.getValue().getNode()->getOperand(1); + // If the RHS involves a register with multiple uses, this + // transformation incurs an extra mov, due to the neg instruction + // clobbering its operand. + if (!RHS.getNode()->hasOneUse() || + RHS.getNode()->getOpcode() == ISD::CopyFromReg || + RHS.getNode()->getOpcode() == ISD::TRUNCATE || + RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || + (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && + RHS.getNode()->getOperand(0).getValueType() == MVT::i32)) + ++Cost; + // If the base is a register with multiple uses, this + // transformation may save a mov. + if ((AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() && + !AM.Base_Reg.getNode()->hasOneUse()) || + AM.BaseType == X86ISelAddressMode::FrameIndexBase) + --Cost; + // If the folded LHS was interesting, this transformation saves + // address arithmetic. + if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + + ((AM.Disp != 0) && (Backup.Disp == 0)) + + (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) + --Cost; + // If it doesn't look like it may be an overall win, don't do it. + if (Cost >= 0) { + AM = Backup; + break; + } + + // Ok, the transformation is legal and appears profitable. Go for it. 
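+    // Materialize -B as (sub 0, B) and use it as the index with scale 1, so
+    // A-B is matched as base + 1*(-B).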
+ SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType()); + SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS); + AM.IndexReg = Neg; + AM.Scale = 1; + + // Insert the new nodes into the topological ordering. + insertDAGNode(*CurDAG, N, Zero); + insertDAGNode(*CurDAG, N, Neg); + return false; + } + + case ISD::ADD: + if (!matchAdd(N, AM, Depth)) + return false; + break; + + case ISD::OR: + // We want to look through a transform in InstCombine and DAGCombiner that + // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. + // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) + // An 'lea' can then be used to match the shift (multiply) and add: + // and $1, %esi + // lea (%rsi, %rdi, 8), %rax + if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && + !matchAdd(N, AM, Depth)) + return false; + break; + + case ISD::AND: { + // Perform some heroic transforms on an and of a constant-count shift + // with a constant to enable use of the scaled offset field. + + // Scale must not be used already. + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; + + SDValue Shift = N.getOperand(0); + if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; + SDValue X = Shift.getOperand(0); + + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getSimpleValueType().getSizeInBits() > 64) break; + + if (!isa<ConstantSDNode>(N.getOperand(1))) + break; + uint64_t Mask = N.getConstantOperandVal(1); + + // Try to fold the mask and shift into an extract and scale. + if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to fold the mask and shift directly into the scale. + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to swap the mask and shift to place shifts which can be done as + // a scale on the outside of the mask. + if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) + return false; + break; + } + } + + return matchAddressBase(N, AM); +} + +/// Helper for MatchAddress. Add the specified node to the +/// specified addressing mode without any further recursion. +bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { + // Is the base register already occupied? + if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { + // If so, check to see if the scale index register is set. + if (!AM.IndexReg.getNode()) { + AM.IndexReg = N; + AM.Scale = 1; + return false; + } + + // Otherwise, we cannot select it. + return true; + } + + // Default, generate it as a register. + AM.BaseType = X86ISelAddressMode::RegBase; + AM.Base_Reg = N; + return false; +} + +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + + MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent); + if (!Mgs) + return false; + X86ISelAddressMode AM; + unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace(); + // AddrSpace 256 -> GS, 257 -> FS. 
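+  // Address spaces 256 and 257 are the x86 convention for %gs- and
+  // %fs-relative memory, so the matching segment register becomes a segment
+  // override on the emitted gather/scatter.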
+ if (AddrSpace == 256) + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + if (AddrSpace == 257) + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + + SDLoc DL(N); + Base = Mgs->getBasePtr(); + Index = Mgs->getIndex(); + unsigned ScalarSize = Mgs->getValue().getValueType().getScalarSizeInBits(); + Scale = getI8Imm(ScalarSize/8, DL); + + // If Base is 0, the whole address is in index and the Scale is 1 + if (isa<ConstantSDNode>(Base)) { + assert(cast<ConstantSDNode>(Base)->isNullValue() && + "Unexpected base in gather/scatter"); + Scale = getI8Imm(1, DL); + Base = CurDAG->getRegister(0, MVT::i32); + } + if (AM.Segment.getNode()) + Segment = AM.Segment; + else + Segment = CurDAG->getRegister(0, MVT::i32); + Disp = CurDAG->getTargetConstant(0, DL, MVT::i32); + return true; +} + +/// Returns true if it is able to pattern match an addressing mode. +/// It returns the operands which make up the maximal addressing mode it can +/// match by reference. +/// +/// Parent is the parent node of the addr operand that is being matched. It +/// is always a load, store, atomic node, or null. It is only null when +/// checking memory operands for inline asm nodes. +bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + X86ISelAddressMode AM; + + if (Parent && + // This list of opcodes are all the nodes that have an "addr:$ptr" operand + // that are not a MemSDNode, and thus don't have proper addrspace info. + Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme + Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores + Parent->getOpcode() != X86ISD::TLSCALL && // Fixme + Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp + Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp + unsigned AddrSpace = + cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); + // AddrSpace 256 -> GS, 257 -> FS. + if (AddrSpace == 256) + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + if (AddrSpace == 257) + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + } + + if (matchAddress(N, AM)) + return false; + + MVT VT = N.getSimpleValueType(); + if (AM.BaseType == X86ISelAddressMode::RegBase) { + if (!AM.Base_Reg.getNode()) + AM.Base_Reg = CurDAG->getRegister(0, VT); + } + + if (!AM.IndexReg.getNode()) + AM.IndexReg = CurDAG->getRegister(0, VT); + + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + return true; +} + +/// Match a scalar SSE load. In particular, we want to match a load whose top +/// elements are either undef or zeros. The load flavor is derived from the +/// type of N, which is either v4f32 or v2f64. +/// +/// We also return: +/// PatternChainNode: this is the matched node that has a chain input and +/// output. 
+bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, + SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment, + SDValue &PatternNodeWithChain) { + if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) { + PatternNodeWithChain = N.getOperand(0); + if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && + PatternNodeWithChain.hasOneUse() && + IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && + IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { + LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + return false; + return true; + } + } + + // Also handle the case where we explicitly require zeros in the top + // elements. This is a vector shuffle from the zero vector. + if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() && + // Check to see if the top elements are all zeros (or bitcast of zeros). + N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + N.getOperand(0).getNode()->hasOneUse() && + ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) && + N.getOperand(0).getOperand(0).hasOneUse() && + IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && + IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { + // Okay, this is a zero extending load. Fold it. + LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0)); + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + return false; + PatternNodeWithChain = SDValue(LD, 0); + return true; + } + return false; +} + + +bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { + if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { + uint64_t ImmVal = CN->getZExtValue(); + if ((uint32_t)ImmVal != (uint64_t)ImmVal) + return false; + + Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64); + return true; + } + + // In static codegen with small code model, we can get the address of a label + // into a register with 'movl'. TableGen has already made sure we're looking + // at a label of some kind. + assert(N->getOpcode() == X86ISD::Wrapper && + "Unexpected node type for MOV32ri64"); + N = N.getOperand(0); + + if (N->getOpcode() != ISD::TargetConstantPool && + N->getOpcode() != ISD::TargetJumpTable && + N->getOpcode() != ISD::TargetGlobalAddress && + N->getOpcode() != ISD::TargetExternalSymbol && + N->getOpcode() != ISD::MCSymbol && + N->getOpcode() != ISD::TargetBlockAddress) + return false; + + Imm = N; + return TM.getCodeModel() == CodeModel::Small; +} + +bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) + return false; + + SDLoc DL(N); + RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); + if (RN && RN->getReg() == 0) + Base = CurDAG->getRegister(0, MVT::i64); + else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) { + // Base could already be %rip, particularly in the x32 ABI. 
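+    // Here the base is a plain 32-bit value, so widen it to 64 bits with
+    // SUBREG_TO_REG for use in the 64-bit addressing mode; the index is
+    // widened the same way below.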
+ Base = SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, + CurDAG->getTargetConstant(0, DL, MVT::i64), + Base, + CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)), + 0); + } + + RN = dyn_cast<RegisterSDNode>(Index); + if (RN && RN->getReg() == 0) + Index = CurDAG->getRegister(0, MVT::i64); + else { + assert(Index.getValueType() == MVT::i32 && + "Expect to be extending 32-bit registers for use in LEA"); + Index = SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, + CurDAG->getTargetConstant(0, DL, MVT::i64), + Index, + CurDAG->getTargetConstant(X86::sub_32bit, DL, + MVT::i32)), + 0); + } + + return true; +} + +/// Calls SelectAddr and determines if the maximal addressing +/// mode it matches can be cost effectively emitted as an LEA instruction. +bool X86DAGToDAGISel::selectLEAAddr(SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + X86ISelAddressMode AM; + + // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support + // segments. + SDValue Copy = AM.Segment; + SDValue T = CurDAG->getRegister(0, MVT::i32); + AM.Segment = T; + if (matchAddress(N, AM)) + return false; + assert (T == AM.Segment); + AM.Segment = Copy; + + MVT VT = N.getSimpleValueType(); + unsigned Complexity = 0; + if (AM.BaseType == X86ISelAddressMode::RegBase) + if (AM.Base_Reg.getNode()) + Complexity = 1; + else + AM.Base_Reg = CurDAG->getRegister(0, VT); + else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) + Complexity = 4; + + if (AM.IndexReg.getNode()) + Complexity++; + else + AM.IndexReg = CurDAG->getRegister(0, VT); + + // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with + // a simple shift. + if (AM.Scale > 1) + Complexity++; + + // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA + // to a LEA. This is determined with some experimentation but is by no means + // optimal (especially for code size consideration). LEA is nice because of + // its three-address nature. Tweak the cost function again when we can run + // convertToThreeAddress() at register allocation time. + if (AM.hasSymbolicDisplacement()) { + // For X86-64, always use LEA to materialize RIP-relative addresses. + if (Subtarget->is64Bit()) + Complexity = 4; + else + Complexity += 2; + } + + if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode())) + Complexity++; + + // If it isn't worth using an LEA, reject it. + if (Complexity <= 2) + return false; + + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + return true; +} + +/// This is only run on TargetGlobalTLSAddress nodes. 
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); + const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); + + X86ISelAddressMode AM; + AM.GV = GA->getGlobal(); + AM.Disp += GA->getOffset(); + AM.Base_Reg = CurDAG->getRegister(0, N.getValueType()); + AM.SymbolFlags = GA->getTargetFlags(); + + if (N.getValueType() == MVT::i32) { + AM.Scale = 1; + AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); + } else { + AM.IndexReg = CurDAG->getRegister(0, MVT::i64); + } + + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + return true; +} + + +bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + if (!ISD::isNON_EXTLoad(N.getNode()) || + !IsProfitableToFold(N, P, P) || + !IsLegalToFold(N, P, P, OptLevel)) + return false; + + return selectAddr(N.getNode(), + N.getOperand(1), Base, Scale, Index, Disp, Segment); +} + +/// Return an SDNode that returns the value of the global base register. +/// Output instructions required to initialize the global base register, +/// if necessary. +SDNode *X86DAGToDAGISel::getGlobalBaseReg() { + unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); + auto &DL = MF->getDataLayout(); + return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); +} + +/// Atomic opcode table +/// +enum AtomicOpc { + ADD, + SUB, + INC, + DEC, + OR, + AND, + XOR, + AtomicOpcEnd +}; + +enum AtomicSz { + ConstantI8, + I8, + SextConstantI16, + ConstantI16, + I16, + SextConstantI32, + ConstantI32, + I32, + SextConstantI64, + ConstantI64, + I64, + AtomicSzEnd +}; + +static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { + { + X86::LOCK_ADD8mi, + X86::LOCK_ADD8mr, + X86::LOCK_ADD16mi8, + X86::LOCK_ADD16mi, + X86::LOCK_ADD16mr, + X86::LOCK_ADD32mi8, + X86::LOCK_ADD32mi, + X86::LOCK_ADD32mr, + X86::LOCK_ADD64mi8, + X86::LOCK_ADD64mi32, + X86::LOCK_ADD64mr, + }, + { + X86::LOCK_SUB8mi, + X86::LOCK_SUB8mr, + X86::LOCK_SUB16mi8, + X86::LOCK_SUB16mi, + X86::LOCK_SUB16mr, + X86::LOCK_SUB32mi8, + X86::LOCK_SUB32mi, + X86::LOCK_SUB32mr, + X86::LOCK_SUB64mi8, + X86::LOCK_SUB64mi32, + X86::LOCK_SUB64mr, + }, + { + 0, + X86::LOCK_INC8m, + 0, + 0, + X86::LOCK_INC16m, + 0, + 0, + X86::LOCK_INC32m, + 0, + 0, + X86::LOCK_INC64m, + }, + { + 0, + X86::LOCK_DEC8m, + 0, + 0, + X86::LOCK_DEC16m, + 0, + 0, + X86::LOCK_DEC32m, + 0, + 0, + X86::LOCK_DEC64m, + }, + { + X86::LOCK_OR8mi, + X86::LOCK_OR8mr, + X86::LOCK_OR16mi8, + X86::LOCK_OR16mi, + X86::LOCK_OR16mr, + X86::LOCK_OR32mi8, + X86::LOCK_OR32mi, + X86::LOCK_OR32mr, + X86::LOCK_OR64mi8, + X86::LOCK_OR64mi32, + X86::LOCK_OR64mr, + }, + { + X86::LOCK_AND8mi, + X86::LOCK_AND8mr, + X86::LOCK_AND16mi8, + X86::LOCK_AND16mi, + X86::LOCK_AND16mr, + X86::LOCK_AND32mi8, + X86::LOCK_AND32mi, + X86::LOCK_AND32mr, + X86::LOCK_AND64mi8, + X86::LOCK_AND64mi32, + X86::LOCK_AND64mr, + }, + { + X86::LOCK_XOR8mi, + X86::LOCK_XOR8mr, + X86::LOCK_XOR16mi8, + X86::LOCK_XOR16mi, + X86::LOCK_XOR16mr, + X86::LOCK_XOR32mi8, + X86::LOCK_XOR32mi, + X86::LOCK_XOR32mr, + X86::LOCK_XOR64mi8, + X86::LOCK_XOR64mi32, + X86::LOCK_XOR64mr, + } +}; + +// Return the target constant operand for atomic-load-op and do simple +// translations, such as from atomic-load-add to lock-sub. 
The return value is +// one of the following 3 cases: +// + target-constant, the operand could be supported as a target constant. +// + empty, the operand is not needed any more with the new op selected. +// + non-empty, otherwise. +static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, + SDLoc dl, + enum AtomicOpc &Op, MVT NVT, + SDValue Val, + const X86Subtarget *Subtarget) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) { + int64_t CNVal = CN->getSExtValue(); + // Quit if not 32-bit imm. + if ((int32_t)CNVal != CNVal) + return Val; + // Quit if INT32_MIN: it would be negated as it is negative and overflow, + // producing an immediate that does not fit in the 32 bits available for + // an immediate operand to sub. However, it still fits in 32 bits for the + // add (since it is not negated) so we can return target-constant. + if (CNVal == INT32_MIN) + return CurDAG->getTargetConstant(CNVal, dl, NVT); + // For atomic-load-add, we could do some optimizations. + if (Op == ADD) { + // Translate to INC/DEC if ADD by 1 or -1. + if (((CNVal == 1) || (CNVal == -1)) && !Subtarget->slowIncDec()) { + Op = (CNVal == 1) ? INC : DEC; + // No more constant operand after being translated into INC/DEC. + return SDValue(); + } + // Translate to SUB if ADD by negative value. + if (CNVal < 0) { + Op = SUB; + CNVal = -CNVal; + } + } + return CurDAG->getTargetConstant(CNVal, dl, NVT); + } + + // If the value operand is single-used, try to optimize it. + if (Op == ADD && Val.hasOneUse()) { + // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x). + if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) { + Op = SUB; + return Val.getOperand(1); + } + // A special case for i16, which needs truncating as, in most cases, it's + // promoted to i32. We will translate + // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x)) + if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 && + Val.getOperand(0).getOpcode() == ISD::SUB && + X86::isZeroNode(Val.getOperand(0).getOperand(0))) { + Op = SUB; + Val = Val.getOperand(0); + return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT, + Val.getOperand(1)); + } + } + + return Val; +} + +SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) { + if (Node->hasAnyUseOfValue(0)) + return nullptr; + + SDLoc dl(Node); + + // Optimize common patterns for __sync_or_and_fetch and similar arith + // operations where the result is not used. This allows us to use the "lock" + // version of the arithmetic instruction. + SDValue Chain = Node->getOperand(0); + SDValue Ptr = Node->getOperand(1); + SDValue Val = Node->getOperand(2); + SDValue Base, Scale, Index, Disp, Segment; + if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) + return nullptr; + + // Which index into the table. 
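+  // Rows of AtomicOpcTbl are the operations (ADD..XOR) and columns are the
+  // AtomicSz forms, so e.g. AtomicOpcTbl[ADD][SextConstantI32] is
+  // X86::LOCK_ADD32mi8, the 32-bit add of a sign-extended 8-bit immediate.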
+ enum AtomicOpc Op; + switch (Node->getOpcode()) { + default: + return nullptr; + case ISD::ATOMIC_LOAD_OR: + Op = OR; + break; + case ISD::ATOMIC_LOAD_AND: + Op = AND; + break; + case ISD::ATOMIC_LOAD_XOR: + Op = XOR; + break; + case ISD::ATOMIC_LOAD_ADD: + Op = ADD; + break; + } + + Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val, Subtarget); + bool isUnOp = !Val.getNode(); + bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant); + + unsigned Opc = 0; + switch (NVT.SimpleTy) { + default: return nullptr; + case MVT::i8: + if (isCN) + Opc = AtomicOpcTbl[Op][ConstantI8]; + else + Opc = AtomicOpcTbl[Op][I8]; + break; + case MVT::i16: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI16]; + else + Opc = AtomicOpcTbl[Op][ConstantI16]; + } else + Opc = AtomicOpcTbl[Op][I16]; + break; + case MVT::i32: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI32]; + else + Opc = AtomicOpcTbl[Op][ConstantI32]; + } else + Opc = AtomicOpcTbl[Op][I32]; + break; + case MVT::i64: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI64]; + else if (i64immSExt32(Val.getNode())) + Opc = AtomicOpcTbl[Op][ConstantI64]; + else + llvm_unreachable("True 64 bits constant in SelectAtomicLoadArith"); + } else + Opc = AtomicOpcTbl[Op][I64]; + break; + } + + assert(Opc != 0 && "Invalid arith lock transform!"); + + // Building the new node. + SDValue Ret; + if (isUnOp) { + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Chain }; + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); + } else { + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Val, Chain }; + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); + } + + // Copying the MachineMemOperand. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + + // We need to have two outputs as that is what the original instruction had. + // So we add a dummy, undefined output. This is safe as we checked first + // that no-one uses our output anyway. + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, dl).getNode(); +} + +/// Test whether the given X86ISD::CMP node has any uses which require the SF +/// or OF bits to be accurate. +static bool hasNoSignedComparisonUses(SDNode *N) { + // Examine each user of the node. + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); UI != UE; ++UI) { + // Only examine CopyToReg uses. + if (UI->getOpcode() != ISD::CopyToReg) + return false; + // Only examine CopyToReg uses that copy to EFLAGS. + if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != + X86::EFLAGS) + return false; + // Examine each user of the CopyToReg use. + for (SDNode::use_iterator FlagUI = UI->use_begin(), + FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { + // Only examine the Flag result. + if (FlagUI.getUse().getResNo() != 1) continue; + // Anything unusual: assume conservatively. + if (!FlagUI->isMachineOpcode()) return false; + // Examine the opcode of the user. + switch (FlagUI->getMachineOpcode()) { + // These comparisons don't treat the most significant bit specially. 
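+      // That is, only the unsigned conditions (A/AE/B/BE), equality (E/NE)
+      // and parity (P/NP) forms are accepted; anything that reads SF or OF is
+      // handled conservatively by the default case below.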
+ case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr: + case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr: + case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm: + case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm: + case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1: + case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1: + case X86::CMOVA16rr: case X86::CMOVA16rm: + case X86::CMOVA32rr: case X86::CMOVA32rm: + case X86::CMOVA64rr: case X86::CMOVA64rm: + case X86::CMOVAE16rr: case X86::CMOVAE16rm: + case X86::CMOVAE32rr: case X86::CMOVAE32rm: + case X86::CMOVAE64rr: case X86::CMOVAE64rm: + case X86::CMOVB16rr: case X86::CMOVB16rm: + case X86::CMOVB32rr: case X86::CMOVB32rm: + case X86::CMOVB64rr: case X86::CMOVB64rm: + case X86::CMOVBE16rr: case X86::CMOVBE16rm: + case X86::CMOVBE32rr: case X86::CMOVBE32rm: + case X86::CMOVBE64rr: case X86::CMOVBE64rm: + case X86::CMOVE16rr: case X86::CMOVE16rm: + case X86::CMOVE32rr: case X86::CMOVE32rm: + case X86::CMOVE64rr: case X86::CMOVE64rm: + case X86::CMOVNE16rr: case X86::CMOVNE16rm: + case X86::CMOVNE32rr: case X86::CMOVNE32rm: + case X86::CMOVNE64rr: case X86::CMOVNE64rm: + case X86::CMOVNP16rr: case X86::CMOVNP16rm: + case X86::CMOVNP32rr: case X86::CMOVNP32rm: + case X86::CMOVNP64rr: case X86::CMOVNP64rm: + case X86::CMOVP16rr: case X86::CMOVP16rm: + case X86::CMOVP32rr: case X86::CMOVP32rm: + case X86::CMOVP64rr: case X86::CMOVP64rm: + continue; + // Anything else: assume conservatively. + default: return false; + } + } + } + return true; +} + +/// Check whether or not the chain ending in StoreNode is suitable for doing +/// the {load; increment or decrement; store} to modify transformation. +static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, + SDValue StoredVal, SelectionDAG *CurDAG, + LoadSDNode* &LoadNode, SDValue &InputChain) { + + // is the value stored the result of a DEC or INC? + if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false; + + // is the stored value result 0 of the load? + if (StoredVal.getResNo() != 0) return false; + + // are there other uses of the loaded value than the inc or dec? + if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; + + // is the store non-extending and non-indexed? + if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) + return false; + + SDValue Load = StoredVal->getOperand(0); + // Is the stored value a non-extending and non-indexed load? + if (!ISD::isNormalLoad(Load.getNode())) return false; + + // Return LoadNode by reference. + LoadNode = cast<LoadSDNode>(Load); + // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8) + EVT LdVT = LoadNode->getMemoryVT(); + if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 && + LdVT != MVT::i8) + return false; + + // Is store the only read of the loaded value? + if (!Load.hasOneUse()) + return false; + + // Is the address of the store the same as the load? + if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || + LoadNode->getOffset() != StoreNode->getOffset()) + return false; + + // Check if the chain is produced by the load or is a TokenFactor with + // the load output chain as an operand. Return InputChain by reference. 
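+  // The TokenFactor case covers chains with other memory operations between
+  // the load and the store; their chain operands (minus the load) are kept in
+  // the new TokenFactor built below.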
+ SDValue Chain = StoreNode->getChain(); + + bool ChainCheck = false; + if (Chain == Load.getValue(1)) { + ChainCheck = true; + InputChain = LoadNode->getChain(); + } else if (Chain.getOpcode() == ISD::TokenFactor) { + SmallVector<SDValue, 4> ChainOps; + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { + SDValue Op = Chain.getOperand(i); + if (Op == Load.getValue(1)) { + ChainCheck = true; + continue; + } + + // Make sure using Op as part of the chain would not cause a cycle here. + // In theory, we could check whether the chain node is a predecessor of + // the load. But that can be very expensive. Instead visit the uses and + // make sure they all have smaller node id than the load. + int LoadId = LoadNode->getNodeId(); + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = UI->use_end(); UI != UE; ++UI) { + if (UI.getUse().getResNo() != 0) + continue; + if (UI->getNodeId() > LoadId) + return false; + } + + ChainOps.push_back(Op); + } + + if (ChainCheck) + // Make a new TokenFactor with all the other input chains except + // for the load. + InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), + MVT::Other, ChainOps); + } + if (!ChainCheck) + return false; + + return true; +} + +/// Get the appropriate X86 opcode for an in-memory increment or decrement. +/// Opc should be X86ISD::DEC or X86ISD::INC. +static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { + if (Opc == X86ISD::DEC) { + if (LdVT == MVT::i64) return X86::DEC64m; + if (LdVT == MVT::i32) return X86::DEC32m; + if (LdVT == MVT::i16) return X86::DEC16m; + if (LdVT == MVT::i8) return X86::DEC8m; + } else { + assert(Opc == X86ISD::INC && "unrecognized opcode"); + if (LdVT == MVT::i64) return X86::INC64m; + if (LdVT == MVT::i32) return X86::INC32m; + if (LdVT == MVT::i16) return X86::INC16m; + if (LdVT == MVT::i8) return X86::INC8m; + } + llvm_unreachable("unrecognized size for LdVT"); +} + +/// Customized ISel for GATHER operations. +SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) { + // Operands of Gather: VSrc, Base, VIdx, VMask, Scale + SDValue Chain = Node->getOperand(0); + SDValue VSrc = Node->getOperand(2); + SDValue Base = Node->getOperand(3); + SDValue VIdx = Node->getOperand(4); + SDValue VMask = Node->getOperand(5); + ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6)); + if (!Scale) + return nullptr; + + SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), + MVT::Other); + + SDLoc DL(Node); + + // Memory Operands: Base, Scale, Index, Disp, Segment + SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i32); + const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx, + Disp, Segment, VMask, Chain}; + SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); + // Node has 2 outputs: VDst and MVT::Other. + // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. + // We replace VDst of Node with VDst of ResNode, and Other of Node with Other + // of ResNode. 
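+  // ResNode's second result, the write-back mask VMask_wb, has no counterpart
+  // in the intrinsic node and is left unused.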
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2)); + return ResNode; +} + +SDNode *X86DAGToDAGISel::Select(SDNode *Node) { + MVT NVT = Node->getSimpleValueType(0); + unsigned Opc, MOpc; + unsigned Opcode = Node->getOpcode(); + SDLoc dl(Node); + + DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n'); + + if (Node->isMachineOpcode()) { + DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); + Node->setNodeId(-1); + return nullptr; // Already selected. + } + + switch (Opcode) { + default: break; + case ISD::BRIND: { + if (Subtarget->isTargetNaCl()) + // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We + // leave the instruction alone. + break; + if (Subtarget->isTarget64BitILP32()) { + // Converts a 32-bit register to a 64-bit, zero-extended version of + // it. This is needed because x86-64 can do many things, but jmp %r32 + // ain't one of them. + const SDValue &Target = Node->getOperand(1); + assert(Target.getSimpleValueType() == llvm::MVT::i32); + SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); + SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, + Node->getOperand(0), ZextTarget); + ReplaceUses(SDValue(Node, 0), Brind); + SelectCode(ZextTarget.getNode()); + SelectCode(Brind.getNode()); + return nullptr; + } + break; + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: break; + case Intrinsic::x86_avx2_gather_d_pd: + case Intrinsic::x86_avx2_gather_d_pd_256: + case Intrinsic::x86_avx2_gather_q_pd: + case Intrinsic::x86_avx2_gather_q_pd_256: + case Intrinsic::x86_avx2_gather_d_ps: + case Intrinsic::x86_avx2_gather_d_ps_256: + case Intrinsic::x86_avx2_gather_q_ps: + case Intrinsic::x86_avx2_gather_q_ps_256: + case Intrinsic::x86_avx2_gather_d_q: + case Intrinsic::x86_avx2_gather_d_q_256: + case Intrinsic::x86_avx2_gather_q_q: + case Intrinsic::x86_avx2_gather_q_q_256: + case Intrinsic::x86_avx2_gather_d_d: + case Intrinsic::x86_avx2_gather_d_d_256: + case Intrinsic::x86_avx2_gather_q_d: + case Intrinsic::x86_avx2_gather_q_d_256: { + if (!Subtarget->hasAVX2()) + break; + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break; + case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break; + case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break; + case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break; + case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break; + case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break; + case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break; + case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break; + case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break; + case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break; + case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break; + case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break; + case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break; + case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break; + case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; + case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; + } + SDNode *RetVal = selectGather(Node, Opc); + if (RetVal) 
+ // We already called ReplaceUses inside SelectGather. + return nullptr; + break; + } + } + break; + } + case X86ISD::GlobalBaseReg: + return getGlobalBaseReg(); + + case X86ISD::SHRUNKBLEND: { + // SHRUNKBLEND selects like a regular VSELECT. + SDValue VSelect = CurDAG->getNode( + ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2)); + ReplaceUses(SDValue(Node, 0), VSelect); + SelectCode(VSelect.getNode()); + // We already called ReplaceUses. + return nullptr; + } + + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_ADD: { + SDNode *RetVal = selectAtomicLoadArith(Node, NVT); + if (RetVal) + return RetVal; + break; + } + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + break; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + break; + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!Cst || !ShlCst) + break; + + int64_t Val = Cst->getSExtValue(); + uint64_t ShlVal = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1; + if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) + break; + + unsigned ShlOp, AddOp, Op; + MVT CstVT = NVT; + + // Check the minimum bitwidth for the new constant. + // TODO: AND32ri is the same as AND64ri32 with zext imm. + // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. + if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) + CstVT = MVT::i8; + else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) + CstVT = MVT::i32; + + // Bail if there is no smaller encoding. + if (NVT == CstVT) + break; + + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i32: + assert(CstVT == MVT::i8); + ShlOp = X86::SHL32ri; + AddOp = X86::ADD32rr; + + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case ISD::AND: Op = X86::AND32ri8; break; + case ISD::OR: Op = X86::OR32ri8; break; + case ISD::XOR: Op = X86::XOR32ri8; break; + } + break; + case MVT::i64: + assert(CstVT == MVT::i8 || CstVT == MVT::i32); + ShlOp = X86::SHL64ri; + AddOp = X86::ADD64rr; + + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; + case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; + case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + } + break; + } + + // Emit the smaller op and the shift. 
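+    // For example, (or (shl x, 8), 0x100) on i32 becomes an OR32ri8 of 1
+    // followed by SHL32ri by 8: the 1-byte immediate replaces the 4-byte
+    // immediate the original constant would have needed.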
+ SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT); + SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); + if (ShlVal == 1) + return CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0), + SDValue(New, 0)); + return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), + getI8Imm(ShlVal, dl)); + } + case X86ISD::UMUL8: + case X86ISD::SMUL8: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r); + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL, + N0, SDValue()).getValue(1); + + SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32); + SDValue Ops[] = {N1, InFlag}; + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); + return nullptr; + } + + case X86ISD::UMUL: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + unsigned LoReg; + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break; + case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break; + case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break; + case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break; + } + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, + N0, SDValue()).getValue(1); + + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); + SDValue Ops[] = {N1, InFlag}; + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2)); + return nullptr; + } + + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + bool isSigned = Opcode == ISD::SMUL_LOHI; + bool hasBMI2 = Subtarget->hasBMI2(); + if (!isSigned) { + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break; + case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break; + case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r; + MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break; + case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r; + MOpc = hasBMI2 ? 
X86::MULX64rm : X86::MUL64m; break; + } + } else { + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break; + case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break; + case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break; + case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break; + } + } + + unsigned SrcReg, LoReg, HiReg; + switch (Opc) { + default: llvm_unreachable("Unknown MUL opcode!"); + case X86::IMUL8r: + case X86::MUL8r: + SrcReg = LoReg = X86::AL; HiReg = X86::AH; + break; + case X86::IMUL16r: + case X86::MUL16r: + SrcReg = LoReg = X86::AX; HiReg = X86::DX; + break; + case X86::IMUL32r: + case X86::MUL32r: + SrcReg = LoReg = X86::EAX; HiReg = X86::EDX; + break; + case X86::IMUL64r: + case X86::MUL64r: + SrcReg = LoReg = X86::RAX; HiReg = X86::RDX; + break; + case X86::MULX32rr: + SrcReg = X86::EDX; LoReg = HiReg = 0; + break; + case X86::MULX64rr: + SrcReg = X86::RDX; LoReg = HiReg = 0; + break; + } + + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + // Multiply is commmutative. + if (!foldedLoad) { + foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + if (foldedLoad) + std::swap(N0, N1); + } + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg, + N0, SDValue()).getValue(1); + SDValue ResHi, ResLo; + + if (foldedLoad) { + SDValue Chain; + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), + InFlag }; + if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + Chain = SDValue(CNode, 2); + InFlag = SDValue(CNode, 3); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + Chain = SDValue(CNode, 0); + InFlag = SDValue(CNode, 1); + } + + // Update the chain. + ReplaceUses(N1.getValue(1), Chain); + } else { + SDValue Ops[] = { N1, InFlag }; + if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + InFlag = SDValue(CNode, 2); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + InFlag = SDValue(CNode, 0); + } + } + + // Prevent use of AH in a REX instruction by referencing AX instead. + if (HiReg == X86::AH && Subtarget->is64Bit() && + !SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + X86::AX, MVT::i16, InFlag); + InFlag = Result.getValue(2); + // Get the low part if needed. Don't use getCopyFromReg for aliasing + // registers. + if (!SDValue(Node, 0).use_empty()) + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + + // Shift AX down 8 bits. + Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, + Result, + CurDAG->getTargetConstant(8, dl, MVT::i8)), + 0); + // Then truncate it down to i8. + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + } + // Copy the low half of the result, if it is needed. 
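+    // With MULX the low half is already an explicit result (ResLo); for the
+    // plain MUL/IMUL forms it has to be read back from AL/AX/EAX/RAX with a
+    // glued CopyFromReg.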
+ if (!SDValue(Node, 0).use_empty()) { + if (!ResLo.getNode()) { + assert(LoReg && "Register for low half is not defined!"); + ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, + InFlag); + InFlag = ResLo.getValue(2); + } + ReplaceUses(SDValue(Node, 0), ResLo); + DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the high half of the result, if it is needed. + if (!SDValue(Node, 1).use_empty()) { + if (!ResHi.getNode()) { + assert(HiReg && "Register for high half is not defined!"); + ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, + InFlag); + InFlag = ResHi.getValue(2); + } + ReplaceUses(SDValue(Node, 1), ResHi); + DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); + } + + return nullptr; + } + + case ISD::SDIVREM: + case ISD::UDIVREM: + case X86ISD::SDIVREM8_SEXT_HREG: + case X86ISD::UDIVREM8_ZEXT_HREG: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + bool isSigned = (Opcode == ISD::SDIVREM || + Opcode == X86ISD::SDIVREM8_SEXT_HREG); + if (!isSigned) { + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break; + case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break; + case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break; + case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break; + } + } else { + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break; + case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break; + case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break; + case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break; + } + } + + unsigned LoReg, HiReg, ClrReg; + unsigned SExtOpcode; + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: + LoReg = X86::AL; ClrReg = HiReg = X86::AH; + SExtOpcode = X86::CBW; + break; + case MVT::i16: + LoReg = X86::AX; HiReg = X86::DX; + ClrReg = X86::DX; + SExtOpcode = X86::CWD; + break; + case MVT::i32: + LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; + SExtOpcode = X86::CDQ; + break; + case MVT::i64: + LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; + SExtOpcode = X86::CQO; + break; + } + + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool signBitIsZero = CurDAG->SignBitIsZero(N0); + + SDValue InFlag; + if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) { + // Special case for div8, just use a move with zero extension to AX to + // clear the upper 8 bits (AH). + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain; + if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; + Move = + SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, + MVT::Other, Ops), 0); + Chain = Move.getValue(1); + ReplaceUses(N0.getValue(1), Chain); + } else { + Move = + SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0); + Chain = CurDAG->getEntryNode(); + } + Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue()); + InFlag = Chain.getValue(1); + } else { + InFlag = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, + LoReg, N0, SDValue()).getValue(1); + if (isSigned && !signBitIsZero) { + // Sign extend the low part into the high part. 
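+        // For i32 this emits CDQ, which replicates the sign bit of EAX into
+        // EDX so that EDX:EAX holds the sign-extended dividend (CBW, CWD and
+        // CQO play the same role for the other widths).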
+ InFlag = + SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); + } else { + // Zero out the high part, effectively zero extending the input. + SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); + switch (NVT.SimpleTy) { + case MVT::i16: + ClrNode = + SDValue(CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, + CurDAG->getTargetConstant(X86::sub_16bit, dl, + MVT::i32)), + 0); + break; + case MVT::i32: + break; + case MVT::i64: + ClrNode = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, + CurDAG->getTargetConstant(X86::sub_32bit, dl, + MVT::i32)), + 0); + break; + default: + llvm_unreachable("Unexpected division source"); + } + + InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, + ClrNode, InFlag).getValue(1); + } + } + + if (foldedLoad) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), + InFlag }; + SDNode *CNode = + CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); + InFlag = SDValue(CNode, 1); + // Update the chain. + ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); + } else { + InFlag = + SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); + } + + // Prevent use of AH in a REX instruction by explicitly copying it to + // an ABCD_L register. + // + // The current assumption of the register allocator is that isel + // won't generate explicit references to the GR8_ABCD_H registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. + if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { + SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); + unsigned AHExtOpcode = + isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8; + + SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, + MVT::Glue, AHCopy, InFlag); + SDValue Result(RNode, 0); + InFlag = SDValue(RNode, 1); + + if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG || + Opcode == X86ISD::SDIVREM8_SEXT_HREG) { + if (Node->getValueType(1) == MVT::i64) { + // It's not possible to directly movsx AH to a 64bit register, because + // the latter needs the REX prefix, but the former can't have it. + assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG && + "Unexpected i64 sext of h-register"); + Result = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), Result, + CurDAG->getTargetConstant(X86::sub_32bit, dl, + MVT::i32)), + 0); + } + } else { + Result = + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); + } + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the division (low) result, if it is needed. + if (!SDValue(Node, 0).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + LoReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 0), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the remainder (high) result, if it is needed. 
+ if (!SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + HiReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + return nullptr; + } + + case X86ISD::CMP: + case X86ISD::SUB: { + // Sometimes a SUB is used to perform comparison. + if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0)) + // This node is not a CMP. + break; + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && + hasNoSignedComparisonUses(Node)) + N0 = N0.getOperand(0); + + // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to + // use a smaller encoding. + // Look past the truncate if CMP is the only use of it. + if ((N0.getNode()->getOpcode() == ISD::AND || + (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) && + N0.getNode()->hasOneUse() && + N0.getValueType() != MVT::i8 && + X86::isZeroNode(N1)) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1)); + if (!C) break; + + // For example, convert "testl %eax, $8" to "testb %al, $8" + if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && + (!(C->getZExtValue() & 0x80) || + hasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); + SDValue Reg = N0.getNode()->getOperand(0); + + // On x86-32, only the ABCD registers have 8-bit subregisters. + if (!Subtarget->is64Bit()) { + const TargetRegisterClass *TRC; + switch (N0.getSimpleValueType().SimpleTy) { + case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; + case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; + default: llvm_unreachable("Unsupported TEST operand type!"); + } + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); + Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, + Reg.getValueType(), Reg, RC), 0); + } + + // Extract the l-register. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, + MVT::i8, Reg); + + // Emit a testb. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return nullptr; + } + + // For example, "testl %eax, $2048" to "testb %ah, $8". + if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 && + (!(C->getZExtValue() & 0x8000) || + hasNoSignedComparisonUses(Node))) { + // Shift the immediate right by 8 bits. + SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, + dl, MVT::i8); + SDValue Reg = N0.getNode()->getOperand(0); + + // Put the value in an ABCD register. + const TargetRegisterClass *TRC; + switch (N0.getSimpleValueType().SimpleTy) { + case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break; + case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; + case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; + default: llvm_unreachable("Unsupported TEST operand type!"); + } + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); + Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, + Reg.getValueType(), Reg, RC), 0); + + // Extract the h-register. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, + MVT::i8, Reg); + + // Emit a testb. 
The EXTRACT_SUBREG becomes a COPY that can only + // target GR8_NOREX registers, so make sure the register class is + // forced. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, + MVT::i32, Subreg, ShiftedImm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return nullptr; + } + + // For example, "testl %eax, $32776" to "testw %ax, $32776". + if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 && + N0.getValueType() != MVT::i16 && + (!(C->getZExtValue() & 0x8000) || + hasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, + MVT::i16); + SDValue Reg = N0.getNode()->getOperand(0); + + // Extract the 16-bit subregister. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, + MVT::i16, Reg); + + // Emit a testw. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return nullptr; + } + + // For example, "testq %rax, $268468232" to "testl %eax, $268468232". + if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 && + N0.getValueType() == MVT::i64 && + (!(C->getZExtValue() & 0x80000000) || + hasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, + MVT::i32); + SDValue Reg = N0.getNode()->getOperand(0); + + // Extract the 32-bit subregister. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl, + MVT::i32, Reg); + + // Emit a testl. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return nullptr; + } + } + break; + } + case ISD::STORE: { + // Change a chain of {load; incr or dec; store} of the same value into + // a simple increment or decrement through memory of that value, if the + // uses of the modified value and its address are suitable. + // The DEC64m tablegen pattern is currently not able to match the case where + // the EFLAGS on the original DEC are used. (This also applies to + // {INC,DEC}X{64,32,16,8}.) + // We'll need to improve tablegen to allow flags to be transferred from a + // node in the pattern to the result node. 
probably with a new keyword + // for example, we have this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (implicit EFLAGS)]>; + // but maybe need something like this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (transferrable EFLAGS)]>; + + StoreSDNode *StoreNode = cast<StoreSDNode>(Node); + SDValue StoredVal = StoreNode->getOperand(1); + unsigned Opc = StoredVal->getOpcode(); + + LoadSDNode *LoadNode = nullptr; + SDValue InputChain; + if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG, + LoadNode, InputChain)) + break; + + SDValue Base, Scale, Index, Disp, Segment; + if (!selectAddr(LoadNode, LoadNode->getBasePtr(), + Base, Scale, Index, Disp, Segment)) + break; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2); + MemOp[0] = StoreNode->getMemOperand(); + MemOp[1] = LoadNode->getMemOperand(); + const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain }; + EVT LdVT = LoadNode->getMemoryVT(); + unsigned newOpc = getFusedLdStOpcode(LdVT, Opc); + MachineSDNode *Result = CurDAG->getMachineNode(newOpc, + SDLoc(Node), + MVT::i32, MVT::Other, Ops); + Result->setMemRefs(MemOp, MemOp + 2); + + ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); + ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); + + return Result; + } + } + + SDNode *ResNode = SelectCode(Node); + + DEBUG(dbgs() << "=> "; + if (ResNode == nullptr || ResNode == Node) + Node->dump(CurDAG); + else + ResNode->dump(CurDAG); + dbgs() << '\n'); + + return ResNode; +} + +bool X86DAGToDAGISel:: +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, + std::vector<SDValue> &OutOps) { + SDValue Op0, Op1, Op2, Op3, Op4; + switch (ConstraintID) { + default: + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_i: + // FIXME: It seems strange that 'i' is needed here since it's supposed to + // be an immediate and not a memory constraint. + // Fallthrough. + case InlineAsm::Constraint_o: // offsetable ?? + case InlineAsm::Constraint_v: // not offsetable ?? + case InlineAsm::Constraint_m: // memory + case InlineAsm::Constraint_X: + if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) + return true; + break; + } + + OutOps.push_back(Op0); + OutOps.push_back(Op1); + OutOps.push_back(Op2); + OutOps.push_back(Op3); + OutOps.push_back(Op4); + return false; +} + +/// This pass converts a legalized DAG into a X86-specific DAG, +/// ready for instruction scheduling. +FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new X86DAGToDAGISel(TM, OptLevel); +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp new file mode 100644 index 0000000..d31aab0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -0,0 +1,28765 @@ +//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that X86 uses to lower LLVM code into a +// selection DAG. 
+// +//===----------------------------------------------------------------------===// + +#include "X86ISelLowering.h" +#include "Utils/X86ShuffleDecode.h" +#include "X86CallingConv.h" +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86ShuffleDecodeConstantPool.h" +#include "X86TargetMachine.h" +#include "X86TargetObjectFile.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetOptions.h" +#include "X86IntrinsicsInfo.h" +#include <bitset> +#include <numeric> +#include <cctype> +using namespace llvm; + +#define DEBUG_TYPE "x86-isel" + +STATISTIC(NumTailCalls, "Number of tail calls"); + +static cl::opt<bool> ExperimentalVectorWideningLegalization( + "x86-experimental-vector-widening-legalization", cl::init(false), + cl::desc("Enable an experimental vector type legalization through widening " + "rather than promotion."), + cl::Hidden); + +X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, + const X86Subtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); + + // Set up the TargetLowering object. + + // X86 is weird. It always uses i8 for shift amounts and setcc results. + setBooleanContents(ZeroOrOneBooleanContent); + // X86-SSE is even stranger. It uses -1 or 0 for vector masks. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // For 64-bit, since we have so many registers, use the ILP scheduler. + // For 32-bit, use the register pressure specific scheduling. + // For Atom, always use ILP scheduling. + if (Subtarget->isAtom()) + setSchedulingPreference(Sched::ILP); + else if (Subtarget->is64Bit()) + setSchedulingPreference(Sched::ILP); + else + setSchedulingPreference(Sched::RegPressure); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); + + // Bypass expensive divides on Atom when compiling with O2. + if (TM.getOptLevel() >= CodeGenOpt::Default) { + if (Subtarget->hasSlowDivide32()) + addBypassSlowDiv(32, 8); + if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) + addBypassSlowDiv(64, 16); + } + + if (Subtarget->isTargetKnownWindowsMSVC()) { + // Setup Windows compiler runtime calls. 
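+    // (e.g. a 64-bit sdiv/udiv/srem/urem/mul on 32-bit MSVC targets becomes a
+    // __stdcall call into the CRT helpers named below.)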
+ setLibcallName(RTLIB::SDIV_I64, "_alldiv"); + setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); + setLibcallName(RTLIB::SREM_I64, "_allrem"); + setLibcallName(RTLIB::UREM_I64, "_aullrem"); + setLibcallName(RTLIB::MUL_I64, "_allmul"); + setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); + } + + if (Subtarget->isTargetDarwin()) { + // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. + setUseUnderscoreSetJmp(false); + setUseUnderscoreLongJmp(false); + } else if (Subtarget->isTargetWindowsGNU()) { + // MS runtime is weird: it exports _setjmp, but longjmp! + setUseUnderscoreSetJmp(true); + setUseUnderscoreLongJmp(false); + } else { + setUseUnderscoreSetJmp(true); + setUseUnderscoreLongJmp(true); + } + + // Set up the register classes. + addRegisterClass(MVT::i8, &X86::GR8RegClass); + addRegisterClass(MVT::i16, &X86::GR16RegClass); + addRegisterClass(MVT::i32, &X86::GR32RegClass); + if (Subtarget->is64Bit()) + addRegisterClass(MVT::i64, &X86::GR64RegClass); + + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + + // We don't accept any truncstore of integer registers. + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8 , Expand); + setTruncStoreAction(MVT::i32, MVT::i16, Expand); + setTruncStoreAction(MVT::i32, MVT::i8 , Expand); + setTruncStoreAction(MVT::i16, MVT::i8, Expand); + + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // SETOEQ and SETUNE require checking two conditions. + setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); + setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); + + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this + // operation. + setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); + + if (Subtarget->is64Bit()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) + // f32/f64 are legal, f80 is custom. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + else + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + } else if (!Subtarget->useSoftFloat()) { + // We have an algorithm for SSE2->double, and we turn this into a + // 64-bit FILD followed by conditional FADD for other targets. + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + // We have an algorithm for SSE2, and we turn this into a 64-bit + // FILD or VCVTUSI2SS/SD for other targets. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + } + + // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have + // this operation. 
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); + setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); + + if (!Subtarget->useSoftFloat()) { + // SSE has no i16 to fp conversion, only i32 + if (X86ScalarSSEf32) { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } else { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } + } else { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); + } + + // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have + // this operation. + setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); + + if (!Subtarget->useSoftFloat()) { + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); + + if (X86ScalarSSEf32) { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); + } + + // Handle FP_TO_UINT by promoting the destination to a larger signed + // conversion. + setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); + + if (Subtarget->is64Bit()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } else { + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); + } + } else if (!Subtarget->useSoftFloat()) { + // Since AVX is a superset of SSE3, only check for SSE here. + if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) + // Expand FP_TO_UINT into a select. + // FIXME: We would like to use a Custom expander here eventually to do + // the optimal thing for SSE vs. the default expansion in the legalizer. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); + else + // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. + // With SSE3 we can use fisttpll to convert to a signed i64; without + // SSE, we're stuck with a fistpll. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } + + // TODO: when we have SSE, these could be more efficient, by using movd/movq. + if (!X86ScalarSSEf64) { + setOperationAction(ISD::BITCAST , MVT::f32 , Expand); + setOperationAction(ISD::BITCAST , MVT::i32 , Expand); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::BITCAST , MVT::f64 , Expand); + // Without SSE, i64->f64 goes through memory. 
+ setOperationAction(ISD::BITCAST , MVT::i64 , Expand); + } + } + + // Scalar integer divide and remainder are lowered to use operations that + // produce two results, to match the available instructions. This exposes + // the two-result form to trivial CSE, which is able to combine x/y and x%y + // into a single instruction. + // + // Scalar integer multiply-high is also lowered to use two-result + // operations, to match the available instructions. However, plain multiply + // (low) operations are left as Legal, as there are single-result + // instructions for this in x86. Using the two-result multiply instructions + // when both high and low results are needed must be arranged by dagcombine. + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + + // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. + setOperationAction(ISD::ADDC, VT, Custom); + setOperationAction(ISD::ADDE, VT, Custom); + setOperationAction(ISD::SUBC, VT, Custom); + setOperationAction(ISD::SUBE, VT, Custom); + } + + setOperationAction(ISD::BR_JT , MVT::Other, Expand); + setOperationAction(ISD::BRCOND , MVT::Other, Custom); + setOperationAction(ISD::BR_CC , MVT::f32, Expand); + setOperationAction(ISD::BR_CC , MVT::f64, Expand); + setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::f128, Expand); + setOperationAction(ISD::BR_CC , MVT::i8, Expand); + setOperationAction(ISD::BR_CC , MVT::i16, Expand); + setOperationAction(ISD::BR_CC , MVT::i32, Expand); + setOperationAction(ISD::BR_CC , MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); + setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); + + if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { + // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` + // is. We should promote the value to 64-bits to solve this. + // This is what the CRT headers do - `fmodf` is an inline header + // function casting to f64 and calling `fmod`. + setOperationAction(ISD::FREM , MVT::f32 , Promote); + } else { + setOperationAction(ISD::FREM , MVT::f32 , Expand); + } + + setOperationAction(ISD::FREM , MVT::f64 , Expand); + setOperationAction(ISD::FREM , MVT::f80 , Expand); + setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + + // Promote the i8 variants and force them on up to i32 which has a shorter + // encoding. 
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); + if (Subtarget->hasBMI()) { + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + } else { + setOperationAction(ISD::CTTZ , MVT::i16 , Custom); + setOperationAction(ISD::CTTZ , MVT::i32 , Custom); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ , MVT::i64 , Custom); + } + + if (Subtarget->hasLZCNT()) { + // When promoting the i8 variants, force them to i32 for a shorter + // encoding. + setOperationAction(ISD::CTLZ , MVT::i8 , Promote); + AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); + AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + } else { + setOperationAction(ISD::CTLZ , MVT::i8 , Custom); + setOperationAction(ISD::CTLZ , MVT::i16 , Custom); + setOperationAction(ISD::CTLZ , MVT::i32 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::CTLZ , MVT::i64 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + } + } + + // Special handling for half-precision floating point conversions. + // If we don't have F16C support, then lower half float conversions + // into library calls. + if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) { + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + } + + // There's never any support for operations beyond MVT::f32. + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f80, MVT::f16, Expand); + + if (Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP , MVT::i8 , Promote); + } else { + setOperationAction(ISD::CTPOP , MVT::i8 , Expand); + setOperationAction(ISD::CTPOP , MVT::i16 , Expand); + setOperationAction(ISD::CTPOP , MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTPOP , MVT::i64 , Expand); + } + + setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); + + if (!Subtarget->hasMOVBE()) + setOperationAction(ISD::BSWAP , MVT::i16 , Expand); + + // These should be promoted to a larger select which is supported. + setOperationAction(ISD::SELECT , MVT::i1 , Promote); + // X86 wants to expand cmov itself. 
+ setOperationAction(ISD::SELECT , MVT::i8 , Custom); + setOperationAction(ISD::SELECT , MVT::i16 , Custom); + setOperationAction(ISD::SELECT , MVT::i32 , Custom); + setOperationAction(ISD::SELECT , MVT::f32 , Custom); + setOperationAction(ISD::SELECT , MVT::f64 , Custom); + setOperationAction(ISD::SELECT , MVT::f80 , Custom); + setOperationAction(ISD::SELECT , MVT::f128 , Custom); + setOperationAction(ISD::SETCC , MVT::i8 , Custom); + setOperationAction(ISD::SETCC , MVT::i16 , Custom); + setOperationAction(ISD::SETCC , MVT::i32 , Custom); + setOperationAction(ISD::SETCC , MVT::f32 , Custom); + setOperationAction(ISD::SETCC , MVT::f64 , Custom); + setOperationAction(ISD::SETCC , MVT::f80 , Custom); + setOperationAction(ISD::SETCC , MVT::f128 , Custom); + setOperationAction(ISD::SETCCE , MVT::i8 , Custom); + setOperationAction(ISD::SETCCE , MVT::i16 , Custom); + setOperationAction(ISD::SETCCE , MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::SELECT , MVT::i64 , Custom); + setOperationAction(ISD::SETCC , MVT::i64 , Custom); + setOperationAction(ISD::SETCCE , MVT::i64 , Custom); + } + setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); + // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support + // SjLj exception handling but a light-weight setjmp/longjmp replacement to + // support continuation, user-level threading, and etc.. As a result, no + // other SjLj exception interfaces are implemented and please don't build + // your own exception handling based on them. + // LLVM/Clang supports zero-cost DWARF exception handling. + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + + // Darwin ABI issue. + setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); + setOperationAction(ISD::JumpTable , MVT::i32 , Custom); + setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); + if (Subtarget->is64Bit()) + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); + setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); + setOperationAction(ISD::JumpTable , MVT::i64 , Custom); + setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); + setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); + setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); + } + // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) + setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); + setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); + setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); + setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); + setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); + } + + if (Subtarget->hasSSE1()) + setOperationAction(ISD::PREFETCH , MVT::Other, Legal); + + setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); + + // Expand certain atomics + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); + setOperationAction(ISD::ATOMIC_STORE, VT, Custom); + } + + if (Subtarget->hasCmpxchg16b()) { + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); + } + + // FIXME - use subtarget 
debug flags + if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && + !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { + setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); + } + + setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); + + setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); + setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); + + setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); + + // VASTART needs to be custom lowered to use the VarArgsFrameIndex + setOperationAction(ISD::VASTART , MVT::Other, Custom); + setOperationAction(ISD::VAEND , MVT::Other, Expand); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::VAARG , MVT::Other, Custom); + setOperationAction(ISD::VACOPY , MVT::Other, Custom); + } else { + // TargetInfo::CharPtrBuiltinVaList + setOperationAction(ISD::VAARG , MVT::Other, Expand); + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + } + + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + + setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); + + // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. + setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); + setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); + + if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) { + // f32 and f64 use SSE. + // Set up the FP register classes. + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::FR64RegClass); + + // Use ANDPD to simulate FABS. + setOperationAction(ISD::FABS , MVT::f64, Custom); + setOperationAction(ISD::FABS , MVT::f32, Custom); + + // Use XORP to simulate FNEG. + setOperationAction(ISD::FNEG , MVT::f64, Custom); + setOperationAction(ISD::FNEG , MVT::f32, Custom); + + // Use ANDPD and ORPD to simulate FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // Lower this to FGETSIGNx86 plus an AND. + setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); + setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); + + // We don't support sin/cos/fmod + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + + // Expand FP immediates into loads from the stack, except for the special + // cases we handle. + addLegalFPImmediate(APFloat(+0.0)); // xorpd + addLegalFPImmediate(APFloat(+0.0f)); // xorps + } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) { + // Use SSE for f32, x87 for f64. + // Set up the FP register classes. + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::RFP64RegClass); + + // Use ANDPS to simulate FABS. + setOperationAction(ISD::FABS , MVT::f32, Custom); + + // Use XORP to simulate FNEG. + setOperationAction(ISD::FNEG , MVT::f32, Custom); + + setOperationAction(ISD::UNDEF, MVT::f64, Expand); + + // Use ANDPS and ORPS to simulate FCOPYSIGN. 
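+    // (copysign(x, y) is computed with bit masks:
+    //   (x & ~SignMask) | (y & SignMask)
+    // using ANDPS/ANDNPS and ORPS on a constant-pool sign-bit mask.)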
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // We don't support sin/cos/fmod + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + + // Special cases we handle for FP constants. + addLegalFPImmediate(APFloat(+0.0f)); // xorps + addLegalFPImmediate(APFloat(+0.0)); // FLD0 + addLegalFPImmediate(APFloat(+1.0)); // FLD1 + addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS + + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + } + } else if (!Subtarget->useSoftFloat()) { + // f32 and f64 in x87. + // Set up the FP register classes. + addRegisterClass(MVT::f64, &X86::RFP64RegClass); + addRegisterClass(MVT::f32, &X86::RFP32RegClass); + + setOperationAction(ISD::UNDEF, MVT::f64, Expand); + setOperationAction(ISD::UNDEF, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + } + addLegalFPImmediate(APFloat(+0.0)); // FLD0 + addLegalFPImmediate(APFloat(+1.0)); // FLD1 + addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS + addLegalFPImmediate(APFloat(+0.0f)); // FLD0 + addLegalFPImmediate(APFloat(+1.0f)); // FLD1 + addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS + } + + // We don't support FMA. + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f32, Expand); + + // Long double always uses X87, except f128 in MMX. 
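+  // (f128 here is the IEEE quad "__float128" type: values are carried in XMM
+  // registers via FR128, but its arithmetic is softened to libcalls.)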
+ if (!Subtarget->useSoftFloat()) { + if (Subtarget->is64Bit() && Subtarget->hasMMX()) { + addRegisterClass(MVT::f128, &X86::FR128RegClass); + ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); + setOperationAction(ISD::FABS , MVT::f128, Custom); + setOperationAction(ISD::FNEG , MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + } + + addRegisterClass(MVT::f80, &X86::RFP80RegClass); + setOperationAction(ISD::UNDEF, MVT::f80, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); + { + APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); + addLegalFPImmediate(TmpFlt); // FLD0 + TmpFlt.changeSign(); + addLegalFPImmediate(TmpFlt); // FLD0/FCHS + + bool ignored; + APFloat TmpFlt2(+1.0); + TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &ignored); + addLegalFPImmediate(TmpFlt2); // FLD1 + TmpFlt2.changeSign(); + addLegalFPImmediate(TmpFlt2); // FLD1/FCHS + } + + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f80, Expand); + setOperationAction(ISD::FCOS , MVT::f80, Expand); + setOperationAction(ISD::FSINCOS, MVT::f80, Expand); + } + + setOperationAction(ISD::FFLOOR, MVT::f80, Expand); + setOperationAction(ISD::FCEIL, MVT::f80, Expand); + setOperationAction(ISD::FTRUNC, MVT::f80, Expand); + setOperationAction(ISD::FRINT, MVT::f80, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); + setOperationAction(ISD::FMA, MVT::f80, Expand); + } + + // Always use a library call for pow. + setOperationAction(ISD::FPOW , MVT::f32 , Expand); + setOperationAction(ISD::FPOW , MVT::f64 , Expand); + setOperationAction(ISD::FPOW , MVT::f80 , Expand); + + setOperationAction(ISD::FLOG, MVT::f80, Expand); + setOperationAction(ISD::FLOG2, MVT::f80, Expand); + setOperationAction(ISD::FLOG10, MVT::f80, Expand); + setOperationAction(ISD::FEXP, MVT::f80, Expand); + setOperationAction(ISD::FEXP2, MVT::f80, Expand); + setOperationAction(ISD::FMINNUM, MVT::f80, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); + + // First set operation action for all vector types to either promote + // (for widening) or expand (for scalarization). Then we will selectively + // turn on ones that can be effectively codegen'd. 
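+  // (The loop below marks essentially every operation Expand on every vector
+  // type; the feature-specific blocks that follow re-mark the profitable ones
+  // as Legal or Custom.)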
+ for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ADD , VT, Expand); + setOperationAction(ISD::SUB , VT, Expand); + setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::MUL , VT, Expand); + setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::LOAD, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); + setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::SHL, VT, Expand); + setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::SETCC, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); + setOperationAction(ISD::TRUNCATE, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND, VT, Expand); + setOperationAction(ISD::ZERO_EXTEND, VT, Expand); + setOperationAction(ISD::ANY_EXTEND, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(InnerVT, VT, Expand); + + setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); + + // N.b. 
ISD::EXTLOAD legality is basically ignored except for i1-like + // types, we have to deal with them whether we ask for Expansion or not. + // Setting Expand causes its own optimisation problems though, so leave + // them legal. + if (VT.getVectorElementType() == MVT::i1) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); + + // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are + // split/scalarized right now. + if (VT.getVectorElementType() == MVT::f16) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); + } + } + + // FIXME: In order to prevent SSE instructions being expanded to MMX ones + // with -msoft-float, disable use of MMX as well. + if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) { + addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); + // No operations on x86mmx supported, everything uses intrinsics. + } + + // MMX-sized vectors (other than x86mmx) are expected to be expanded + // into smaller operations. + for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) { + setOperationAction(ISD::MULHS, MMXTy, Expand); + setOperationAction(ISD::AND, MMXTy, Expand); + setOperationAction(ISD::OR, MMXTy, Expand); + setOperationAction(ISD::XOR, MMXTy, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand); + setOperationAction(ISD::SELECT, MMXTy, Expand); + setOperationAction(ISD::BITCAST, MMXTy, Expand); + } + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); + + if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) { + addRegisterClass(MVT::v4f32, &X86::VR128RegClass); + + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::FABS, MVT::v4f32, Custom); + setOperationAction(ISD::LOAD, MVT::v4f32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) { + addRegisterClass(MVT::v2f64, &X86::VR128RegClass); + + // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM + // registers cannot be used even for integer operations. 
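+    // (-no-implicit-float promises that the compiler will not introduce
+    // FP/vector register uses the source did not ask for, e.g. in kernel
+    // code that does not save the SSE state.)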
+ addRegisterClass(MVT::v16i8, &X86::VR128RegClass); + addRegisterClass(MVT::v8i16, &X86::VR128RegClass); + addRegisterClass(MVT::v4i32, &X86::VR128RegClass); + addRegisterClass(MVT::v2i64, &X86::VR128RegClass); + + setOperationAction(ISD::ADD, MVT::v16i8, Legal); + setOperationAction(ISD::ADD, MVT::v8i16, Legal); + setOperationAction(ISD::ADD, MVT::v4i32, Legal); + setOperationAction(ISD::ADD, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v16i8, Custom); + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::MULHU, MVT::v8i16, Legal); + setOperationAction(ISD::MULHS, MVT::v8i16, Legal); + setOperationAction(ISD::SUB, MVT::v16i8, Legal); + setOperationAction(ISD::SUB, MVT::v8i16, Legal); + setOperationAction(ISD::SUB, MVT::v4i32, Legal); + setOperationAction(ISD::SUB, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::FADD, MVT::v2f64, Legal); + setOperationAction(ISD::FSUB, MVT::v2f64, Legal); + setOperationAction(ISD::FMUL, MVT::v2f64, Legal); + setOperationAction(ISD::FDIV, MVT::v2f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); + setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FABS, MVT::v2f64, Custom); + + setOperationAction(ISD::SMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v16i8, Legal); + + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + setOperationAction(ISD::SETCC, MVT::v16i8, Custom); + setOperationAction(ISD::SETCC, MVT::v8i16, Custom); + setOperationAction(ISD::SETCC, MVT::v4i32, Custom); + + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::CTPOP, MVT::v16i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + // ISD::CTTZ v2i64 - scalarization is faster. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster. + + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + } + + // We support custom legalizing of sext and anyext loads for specific + // memory vector types which we can load as a scalar (or sequence of + // scalars) and extend in-register to a legal 128-bit vector type. For sext + // loads these must work with a single scalar load. 
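+    // (e.g. a sextload of v4i8 can be done with one 32-bit scalar load
+    // followed by an in-register extend: PMOVSXBD with SSE4.1, or
+    // punpck + arithmetic shifts on plain SSE2.)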
+ for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); + } + + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); + } + + // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v2i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v2i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v2i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); + } + + // Custom lower v2i64 and v2f64 selects. + setOperationAction(ISD::LOAD, MVT::v2f64, Legal); + setOperationAction(ISD::LOAD, MVT::v2i64, Legal); + setOperationAction(ISD::SELECT, MVT::v2f64, Custom); + setOperationAction(ISD::SELECT, MVT::v2i64, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + // As there is no 64-bit GPR available, we need build a special custom + // sequence to convert from v2i32 to v2f32. 
+ if (!Subtarget->is64Bit()) + setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); + + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); + + setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); + setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); + setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) { + for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { + setOperationAction(ISD::FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::FCEIL, RoundedTy, Legal); + setOperationAction(ISD::FTRUNC, RoundedTy, Legal); + setOperationAction(ISD::FRINT, RoundedTy, Legal); + setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + } + + setOperationAction(ISD::SMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMAX, MVT::v4i32, Legal); + setOperationAction(ISD::UMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v4i32, Legal); + setOperationAction(ISD::SMIN, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v4i32, Legal); + setOperationAction(ISD::UMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v4i32, Legal); + + // FIXME: Do we need to handle scalar-to-vector here? + setOperationAction(ISD::MUL, MVT::v4i32, Legal); + + // We directly match byte blends in the backend as they match the VSELECT + // condition form. + setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); + + // SSE41 brings specific instructions for doing vector sign extend even in + // cases where we don't have SRA. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); + } + + // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); + + // i8 and i16 vectors are custom because the source register and source + // source memory operand types are not the same width. f32 vectors are + // custom since the immediate controlling the insert encodes additional + // information. 
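+    // (PINSRB/PINSRW read their scalar from a 32-bit GPR, and the INSERTPS
+    // immediate packs source/destination lane selectors plus a zero mask.)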
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + // FIXME: these should be Legal, but that's only for the case where + // the index is constant. For now custom expand to deal with that. + if (Subtarget->is64Bit()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); + } + } + + if (Subtarget->hasSSE2()) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); + + setOperationAction(ISD::SRL, MVT::v8i16, Custom); + setOperationAction(ISD::SRL, MVT::v16i8, Custom); + + setOperationAction(ISD::SHL, MVT::v8i16, Custom); + setOperationAction(ISD::SHL, MVT::v16i8, Custom); + + setOperationAction(ISD::SRA, MVT::v8i16, Custom); + setOperationAction(ISD::SRA, MVT::v16i8, Custom); + + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. + setOperationAction(ISD::SRL, MVT::v2i64, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Custom); + + setOperationAction(ISD::SHL, MVT::v2i64, Custom); + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + + setOperationAction(ISD::SRA, MVT::v2i64, Custom); + setOperationAction(ISD::SRA, MVT::v4i32, Custom); + } + + if (Subtarget->hasXOP()) { + setOperationAction(ISD::ROTL, MVT::v16i8, Custom); + setOperationAction(ISD::ROTL, MVT::v8i16, Custom); + setOperationAction(ISD::ROTL, MVT::v4i32, Custom); + setOperationAction(ISD::ROTL, MVT::v2i64, Custom); + setOperationAction(ISD::ROTL, MVT::v32i8, Custom); + setOperationAction(ISD::ROTL, MVT::v16i16, Custom); + setOperationAction(ISD::ROTL, MVT::v8i32, Custom); + setOperationAction(ISD::ROTL, MVT::v4i64, Custom); + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { + addRegisterClass(MVT::v32i8, &X86::VR256RegClass); + addRegisterClass(MVT::v16i16, &X86::VR256RegClass); + addRegisterClass(MVT::v8i32, &X86::VR256RegClass); + addRegisterClass(MVT::v8f32, &X86::VR256RegClass); + addRegisterClass(MVT::v4i64, &X86::VR256RegClass); + addRegisterClass(MVT::v4f64, &X86::VR256RegClass); + + setOperationAction(ISD::LOAD, MVT::v8f32, Legal); + setOperationAction(ISD::LOAD, MVT::v4f64, Legal); + setOperationAction(ISD::LOAD, MVT::v4i64, Legal); + + setOperationAction(ISD::FADD, MVT::v8f32, Legal); + setOperationAction(ISD::FSUB, MVT::v8f32, Legal); + setOperationAction(ISD::FMUL, MVT::v8f32, Legal); + setOperationAction(ISD::FDIV, MVT::v8f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); + setOperationAction(ISD::FRINT, MVT::v8f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); + setOperationAction(ISD::FNEG, MVT::v8f32, Custom); + setOperationAction(ISD::FABS, MVT::v8f32, Custom); + + setOperationAction(ISD::FADD, MVT::v4f64, Legal); + setOperationAction(ISD::FSUB, 
MVT::v4f64, Legal); + setOperationAction(ISD::FMUL, MVT::v4f64, Legal); + setOperationAction(ISD::FDIV, MVT::v4f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); + setOperationAction(ISD::FRINT, MVT::v4f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); + setOperationAction(ISD::FNEG, MVT::v4f64, Custom); + setOperationAction(ISD::FABS, MVT::v4f64, Custom); + + // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted + // even though v8i16 is a legal type. + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); + + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); + + setOperationAction(ISD::SRL, MVT::v16i16, Custom); + setOperationAction(ISD::SRL, MVT::v32i8, Custom); + + setOperationAction(ISD::SHL, MVT::v16i16, Custom); + setOperationAction(ISD::SHL, MVT::v32i8, Custom); + + setOperationAction(ISD::SRA, MVT::v16i16, Custom); + setOperationAction(ISD::SRA, MVT::v32i8, Custom); + + setOperationAction(ISD::SETCC, MVT::v32i8, Custom); + setOperationAction(ISD::SETCC, MVT::v16i16, Custom); + setOperationAction(ISD::SETCC, MVT::v8i32, Custom); + setOperationAction(ISD::SETCC, MVT::v4i64, Custom); + + setOperationAction(ISD::SELECT, MVT::v4f64, Custom); + setOperationAction(ISD::SELECT, MVT::v4i64, Custom); + setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); + + setOperationAction(ISD::CTPOP, MVT::v32i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v16i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + + setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + + if (Subtarget->hasAnyFMA()) { + setOperationAction(ISD::FMA, MVT::v8f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f64, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + 
setOperationAction(ISD::FMA, MVT::v2f64, Legal); + setOperationAction(ISD::FMA, MVT::f32, Legal); + setOperationAction(ISD::FMA, MVT::f64, Legal); + } + + if (Subtarget->hasInt256()) { + setOperationAction(ISD::ADD, MVT::v4i64, Legal); + setOperationAction(ISD::ADD, MVT::v8i32, Legal); + setOperationAction(ISD::ADD, MVT::v16i16, Legal); + setOperationAction(ISD::ADD, MVT::v32i8, Legal); + + setOperationAction(ISD::SUB, MVT::v4i64, Legal); + setOperationAction(ISD::SUB, MVT::v8i32, Legal); + setOperationAction(ISD::SUB, MVT::v16i16, Legal); + setOperationAction(ISD::SUB, MVT::v32i8, Legal); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Legal); + setOperationAction(ISD::MUL, MVT::v16i16, Legal); + setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::MULHU, MVT::v16i16, Legal); + setOperationAction(ISD::MULHS, MVT::v16i16, Legal); + + setOperationAction(ISD::SMAX, MVT::v32i8, Legal); + setOperationAction(ISD::SMAX, MVT::v16i16, Legal); + setOperationAction(ISD::SMAX, MVT::v8i32, Legal); + setOperationAction(ISD::UMAX, MVT::v32i8, Legal); + setOperationAction(ISD::UMAX, MVT::v16i16, Legal); + setOperationAction(ISD::UMAX, MVT::v8i32, Legal); + setOperationAction(ISD::SMIN, MVT::v32i8, Legal); + setOperationAction(ISD::SMIN, MVT::v16i16, Legal); + setOperationAction(ISD::SMIN, MVT::v8i32, Legal); + setOperationAction(ISD::UMIN, MVT::v32i8, Legal); + setOperationAction(ISD::UMIN, MVT::v16i16, Legal); + setOperationAction(ISD::UMIN, MVT::v8i32, Legal); + + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting + // when we have a 256bit-wide blend with immediate. 
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); + + // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); + } else { + setOperationAction(ISD::ADD, MVT::v4i64, Custom); + setOperationAction(ISD::ADD, MVT::v8i32, Custom); + setOperationAction(ISD::ADD, MVT::v16i16, Custom); + setOperationAction(ISD::ADD, MVT::v32i8, Custom); + + setOperationAction(ISD::SUB, MVT::v4i64, Custom); + setOperationAction(ISD::SUB, MVT::v8i32, Custom); + setOperationAction(ISD::SUB, MVT::v16i16, Custom); + setOperationAction(ISD::SUB, MVT::v32i8, Custom); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Custom); + setOperationAction(ISD::MUL, MVT::v16i16, Custom); + setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::SMAX, MVT::v32i8, Custom); + setOperationAction(ISD::SMAX, MVT::v16i16, Custom); + setOperationAction(ISD::SMAX, MVT::v8i32, Custom); + setOperationAction(ISD::UMAX, MVT::v32i8, Custom); + setOperationAction(ISD::UMAX, MVT::v16i16, Custom); + setOperationAction(ISD::UMAX, MVT::v8i32, Custom); + setOperationAction(ISD::SMIN, MVT::v32i8, Custom); + setOperationAction(ISD::SMIN, MVT::v16i16, Custom); + setOperationAction(ISD::SMIN, MVT::v8i32, Custom); + setOperationAction(ISD::UMIN, MVT::v32i8, Custom); + setOperationAction(ISD::UMIN, MVT::v16i16, Custom); + setOperationAction(ISD::UMIN, MVT::v8i32, Custom); + } + + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. + setOperationAction(ISD::SRL, MVT::v4i64, Custom); + setOperationAction(ISD::SRL, MVT::v8i32, Custom); + + setOperationAction(ISD::SHL, MVT::v4i64, Custom); + setOperationAction(ISD::SHL, MVT::v8i32, Custom); + + setOperationAction(ISD::SRA, MVT::v4i64, Custom); + setOperationAction(ISD::SRA, MVT::v8i32, Custom); + + // Custom lower several nodes for 256-bit types. + for (MVT VT : MVT::vector_valuetypes()) { + if (VT.getScalarSizeInBits() >= 32) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } + // Extract subvector is special because the value type + // (result) is 128-bit but the source is 256-bit wide. 
+ if (VT.is128BitVector()) { + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + } + // Do not attempt to custom lower other non-256-bit vectors + if (!VT.is256BitVector()) + continue; + + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + } + + if (Subtarget->hasInt256()) + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v4i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v4i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v4i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); + } + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + addRegisterClass(MVT::v16i32, &X86::VR512RegClass); + addRegisterClass(MVT::v16f32, &X86::VR512RegClass); + addRegisterClass(MVT::v8i64, &X86::VR512RegClass); + addRegisterClass(MVT::v8f64, &X86::VR512RegClass); + + addRegisterClass(MVT::i1, &X86::VK1RegClass); + addRegisterClass(MVT::v8i1, &X86::VK8RegClass); + addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); + + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::XOR, MVT::i1, Legal); + setOperationAction(ISD::OR, MVT::i1, Legal); + setOperationAction(ISD::AND, MVT::i1, Legal); + setOperationAction(ISD::SUB, MVT::i1, Custom); + setOperationAction(ISD::ADD, MVT::i1, Custom); + setOperationAction(ISD::MUL, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v16f32, Legal); + setOperationAction(ISD::LOAD, MVT::v8f64, Legal); + setOperationAction(ISD::LOAD, MVT::v8i64, Legal); + setOperationAction(ISD::LOAD, MVT::v16i32, Legal); + setOperationAction(ISD::LOAD, MVT::v16i1, Legal); + + setOperationAction(ISD::FADD, MVT::v16f32, Legal); + setOperationAction(ISD::FSUB, MVT::v16f32, Legal); + setOperationAction(ISD::FMUL, MVT::v16f32, Legal); + 
setOperationAction(ISD::FDIV, MVT::v16f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); + setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + setOperationAction(ISD::FABS, MVT::v16f32, Custom); + + setOperationAction(ISD::FADD, MVT::v8f64, Legal); + setOperationAction(ISD::FSUB, MVT::v8f64, Legal); + setOperationAction(ISD::FMUL, MVT::v8f64, Legal); + setOperationAction(ISD::FDIV, MVT::v8f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); + setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FABS, MVT::v8f64, Custom); + setOperationAction(ISD::FMA, MVT::v8f64, Legal); + setOperationAction(ISD::FMA, MVT::v16f32, Legal); + + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (Subtarget->hasVLX()){ + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } else { + setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); + setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); + } + setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); + if (Subtarget->hasDQI()) { + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, 
MVT::v4i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + } + } + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + } + setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + if (Subtarget->hasDQI()) { + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); + } + setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v8f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal); + setOperationAction(ISD::FRINT, MVT::v16f32, Legal); + setOperationAction(ISD::FRINT, MVT::v8f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); + + setOperationAction(ISD::SETCC, MVT::v16i1, Custom); + setOperationAction(ISD::SETCC, MVT::v8i1, Custom); + + setOperationAction(ISD::MUL, MVT::v8i64, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::SELECT, MVT::v8f64, Custom); + setOperationAction(ISD::SELECT, MVT::v8i64, Custom); + setOperationAction(ISD::SELECT, MVT::v16f32, Custom); + setOperationAction(ISD::SELECT, MVT::v16i1, Custom); + setOperationAction(ISD::SELECT, MVT::v8i1, Custom); + + setOperationAction(ISD::SMAX, MVT::v16i32, Legal); + setOperationAction(ISD::SMAX, MVT::v8i64, Legal); + 
setOperationAction(ISD::UMAX, MVT::v16i32, Legal); + setOperationAction(ISD::UMAX, MVT::v8i64, Legal); + setOperationAction(ISD::SMIN, MVT::v16i32, Legal); + setOperationAction(ISD::SMIN, MVT::v8i64, Legal); + setOperationAction(ISD::UMIN, MVT::v16i32, Legal); + setOperationAction(ISD::UMIN, MVT::v8i64, Legal); + + setOperationAction(ISD::ADD, MVT::v8i64, Legal); + setOperationAction(ISD::ADD, MVT::v16i32, Legal); + + setOperationAction(ISD::SUB, MVT::v8i64, Legal); + setOperationAction(ISD::SUB, MVT::v16i32, Legal); + + setOperationAction(ISD::MUL, MVT::v16i32, Legal); + + setOperationAction(ISD::SRL, MVT::v8i64, Custom); + setOperationAction(ISD::SRL, MVT::v16i32, Custom); + + setOperationAction(ISD::SHL, MVT::v8i64, Custom); + setOperationAction(ISD::SHL, MVT::v16i32, Custom); + + setOperationAction(ISD::SRA, MVT::v8i64, Custom); + setOperationAction(ISD::SRA, MVT::v16i32, Custom); + + setOperationAction(ISD::AND, MVT::v8i64, Legal); + setOperationAction(ISD::OR, MVT::v8i64, Legal); + setOperationAction(ISD::XOR, MVT::v8i64, Legal); + setOperationAction(ISD::AND, MVT::v16i32, Legal); + setOperationAction(ISD::OR, MVT::v16i32, Legal); + setOperationAction(ISD::XOR, MVT::v16i32, Legal); + + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand); + + setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); + + if (Subtarget->hasVLX()) { + setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + } else { + setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + } + } // Subtarget->hasCDI() + + if (Subtarget->hasDQI()) { + setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v4i64, Legal); + setOperationAction(ISD::MUL, MVT::v8i64, Legal); + } + // Custom lower several 
nodes. + for (MVT VT : MVT::vector_valuetypes()) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize == 1) { + setOperationAction(ISD::AND, VT, Legal); + setOperationAction(ISD::OR, VT, Legal); + setOperationAction(ISD::XOR, VT, Legal); + } + if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { + setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); + } + // Extract subvector is special because the value type + // (result) is 256/128-bit but the source is 512-bit wide. + if (VT.is128BitVector() || VT.is256BitVector()) { + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + } + if (VT.getVectorElementType() == MVT::i1) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + + // Do not attempt to custom lower other non-512-bit vectors + if (!VT.is512BitVector()) + continue; + + if (EltSize >= 32) { + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MSCATTER, VT, Custom); + } + } + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); + } + }// has AVX-512 + + if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) { + addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v64i8, &X86::VR512RegClass); + + addRegisterClass(MVT::v32i1, &X86::VK32RegClass); + addRegisterClass(MVT::v64i1, &X86::VK64RegClass); + + setOperationAction(ISD::LOAD, MVT::v32i16, Legal); + setOperationAction(ISD::LOAD, MVT::v64i8, Legal); + setOperationAction(ISD::SETCC, MVT::v32i1, Custom); + setOperationAction(ISD::SETCC, MVT::v64i1, Custom); + setOperationAction(ISD::ADD, MVT::v32i16, Legal); + setOperationAction(ISD::ADD, MVT::v64i8, Legal); + setOperationAction(ISD::SUB, MVT::v32i16, Legal); + setOperationAction(ISD::SUB, MVT::v64i8, Legal); + setOperationAction(ISD::MUL, MVT::v32i16, Legal); + setOperationAction(ISD::MULHS, MVT::v32i16, Legal); + setOperationAction(ISD::MULHU, MVT::v32i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); + setOperationAction(ISD::SELECT, MVT::v32i1, Custom); + setOperationAction(ISD::SELECT, MVT::v64i1, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + 
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); + setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); + setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); + + setOperationAction(ISD::SMAX, MVT::v64i8, Legal); + setOperationAction(ISD::SMAX, MVT::v32i16, Legal); + setOperationAction(ISD::UMAX, MVT::v64i8, Legal); + setOperationAction(ISD::UMAX, MVT::v32i16, Legal); + setOperationAction(ISD::SMIN, MVT::v64i8, Legal); + setOperationAction(ISD::SMIN, MVT::v32i16, Legal); + setOperationAction(ISD::UMIN, MVT::v64i8, Legal); + setOperationAction(ISD::UMIN, MVT::v32i16, Legal); + + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + if (Subtarget->hasVLX()) + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand); + } + + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v8i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v8i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v8i64); + } + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) { + addRegisterClass(MVT::v4i1, &X86::VK4RegClass); + addRegisterClass(MVT::v2i1, &X86::VK2RegClass); + + setOperationAction(ISD::SETCC, MVT::v4i1, Custom); + setOperationAction(ISD::SETCC, MVT::v2i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); + setOperationAction(ISD::SELECT, MVT::v4i1, Custom); + setOperationAction(ISD::SELECT, MVT::v2i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); + + setOperationAction(ISD::AND, MVT::v8i32, Legal); + setOperationAction(ISD::OR, MVT::v8i32, Legal); + setOperationAction(ISD::XOR, MVT::v8i32, Legal); + setOperationAction(ISD::AND, MVT::v4i32, Legal); + setOperationAction(ISD::OR, MVT::v4i32, Legal); + setOperationAction(ISD::XOR, MVT::v4i32, Legal); + 
setOperationAction(ISD::SRA, MVT::v2i64, Custom); + setOperationAction(ISD::SRA, MVT::v4i64, Custom); + + setOperationAction(ISD::SMAX, MVT::v2i64, Legal); + setOperationAction(ISD::SMAX, MVT::v4i64, Legal); + setOperationAction(ISD::UMAX, MVT::v2i64, Legal); + setOperationAction(ISD::UMAX, MVT::v4i64, Legal); + setOperationAction(ISD::SMIN, MVT::v2i64, Legal); + setOperationAction(ISD::SMIN, MVT::v4i64, Legal); + setOperationAction(ISD::UMIN, MVT::v2i64, Legal); + setOperationAction(ISD::UMIN, MVT::v4i64, Legal); + } + + // We want to custom lower some of our intrinsics. + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + if (!Subtarget->is64Bit()) { + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } + + // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't + // handle type legalization for these operations here. + // + // FIXME: We really should do custom legalization for addition and + // subtraction on x86-32 once PR3203 is fixed. We really can't do much better + // than generic legalization for 64-bit multiplication-with-overflow, though. + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget->is64Bit()) + continue; + // Add/Sub/Mul with overflow operations are custom lowered. + setOperationAction(ISD::SADDO, VT, Custom); + setOperationAction(ISD::UADDO, VT, Custom); + setOperationAction(ISD::SSUBO, VT, Custom); + setOperationAction(ISD::USUBO, VT, Custom); + setOperationAction(ISD::SMULO, VT, Custom); + setOperationAction(ISD::UMULO, VT, Custom); + } + + if (!Subtarget->is64Bit()) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + } + + // Combine sin / cos into one node or libcall if possible. + if (Subtarget->hasSinCos()) { + setLibcallName(RTLIB::SINCOS_F32, "sincosf"); + setLibcallName(RTLIB::SINCOS_F64, "sincos"); + if (Subtarget->isTargetDarwin()) { + // For MacOSX, we don't want the normal expansion of a libcall to sincos. + // We want to issue a libcall to __sincos_stret to avoid memory traffic. 
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } + } + + if (Subtarget->isTargetWin64()) { + setOperationAction(ISD::SDIV, MVT::i128, Custom); + setOperationAction(ISD::UDIV, MVT::i128, Custom); + setOperationAction(ISD::SREM, MVT::i128, Custom); + setOperationAction(ISD::UREM, MVT::i128, Custom); + setOperationAction(ISD::SDIVREM, MVT::i128, Custom); + setOperationAction(ISD::UDIVREM, MVT::i128, Custom); + } + + // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); + setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::MLOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::MSTORE); + setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); + setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::MSCATTER); + setTargetDAGCombine(ISD::MGATHER); + + computeRegisterProperties(Subtarget->getRegisterInfo()); + + MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores + MaxStoresPerMemsetOptSize = 8; + MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores + MaxStoresPerMemcpyOptSize = 4; + MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores + MaxStoresPerMemmoveOptSize = 4; + setPrefLoopAlignment(4); // 2^4 bytes. + + // A predictable cmov does not hurt on an in-order CPU. + // FIXME: Use a CPU attribute to trigger this, not a CPU model. + PredictableSelectIsExpensive = !Subtarget->isAtom(); + EnableExtLdPromotion = true; + setPrefFunctionAlignment(4); // 2^4 bytes. + + verifyIntrinsicTables(); +} + +// This has so far only been implemented for 64-bit MachO. +bool X86TargetLowering::useLoadStackGuardNode() const { + return Subtarget->isTargetMachO() && Subtarget->is64Bit(); +} + +TargetLoweringBase::LegalizeTypeAction +X86TargetLowering::getPreferredVectorAction(EVT VT) const { + if (ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() != 1 && + VT.getVectorElementType().getSimpleVT() != MVT::i1) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + +EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT VT) const { + if (!VT.isVector()) + return Subtarget->hasAVX512() ? 
MVT::i1: MVT::i8; + + if (VT.isSimple()) { + MVT VVT = VT.getSimpleVT(); + const unsigned NumElts = VVT.getVectorNumElements(); + const MVT EltVT = VVT.getVectorElementType(); + if (VVT.is512BitVector()) { + if (Subtarget->hasAVX512()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + if (Subtarget->hasBWI()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 32: return MVT::v32i1; + case 64: return MVT::v64i1; + } + } + + if (VVT.is256BitVector() || VVT.is128BitVector()) { + if (Subtarget->hasVLX()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + case 32: return MVT::v32i1; + } + } + } + + return VT.changeVectorElementTypeToInteger(); +} + +/// Helper for getByValTypeAlignment to determine +/// the desired ByVal argument alignment. +static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { + if (MaxAlign == 16) + return; + if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { + if (VTy->getBitWidth() == 128) + MaxAlign = 16; + } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { + unsigned EltAlign = 0; + getMaxByValAlign(ATy->getElementType(), EltAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + } else if (StructType *STy = dyn_cast<StructType>(Ty)) { + for (auto *EltTy : STy->elements()) { + unsigned EltAlign = 0; + getMaxByValAlign(EltTy, EltAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + if (MaxAlign == 16) + break; + } + } +} + +/// Return the desired alignment for ByVal aggregate +/// function arguments in the caller parameter area. For X86, aggregates +/// that contain SSE vectors are placed at 16-byte boundaries while the rest +/// are at 4-byte boundaries. +unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const { + if (Subtarget->is64Bit()) { + // Max of 8 and alignment of type. + unsigned TyAlign = DL.getABITypeAlignment(Ty); + if (TyAlign > 8) + return TyAlign; + return 8; + } + + unsigned Align = 4; + if (Subtarget->hasSSE1()) + getMaxByValAlign(Ty, Align); + return Align; +} + +/// Returns the target specific optimal type for load +/// and store operations as a result of memset, memcpy, and memmove +/// lowering. If DstAlign is zero that means it's safe to destination +/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it +/// means there isn't a need to check it against alignment requirement, +/// probably because the source does not need to be loaded. If 'IsMemset' is +/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that +/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy +/// source is constant so it does not need to be loaded. +/// It returns EVT::Other if the type should be determined using generic +/// target-independent logic. 
+EVT +X86TargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + const Function *F = MF.getFunction(); + if ((!IsMemset || ZeroMemset) && + !F->hasFnAttribute(Attribute::NoImplicitFloat)) { + if (Size >= 16 && + (!Subtarget->isUnalignedMem16Slow() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16)))) { + if (Size >= 32) { + // FIXME: Check if unaligned 32-byte accesses are slow. + if (Subtarget->hasInt256()) + return MVT::v8i32; + if (Subtarget->hasFp256()) + return MVT::v8f32; + } + if (Subtarget->hasSSE2()) + return MVT::v4i32; + if (Subtarget->hasSSE1()) + return MVT::v4f32; + } else if (!MemcpyStrSrc && Size >= 8 && + !Subtarget->is64Bit() && + Subtarget->hasSSE2()) { + // Do not use f64 to lower memcpy if source is string constant. It's + // better to use i32 to avoid the loads. + return MVT::f64; + } + } + // This is a compromise. If we reach here, unaligned accesses may be slow on + // this target. However, creating smaller, aligned accesses could be even + // slower and would certainly be a lot more code. + if (Subtarget->is64Bit() && Size >= 8) + return MVT::i64; + return MVT::i32; +} + +bool X86TargetLowering::isSafeMemOpType(MVT VT) const { + if (VT == MVT::f32) + return X86ScalarSSEf32; + else if (VT == MVT::f64) + return X86ScalarSSEf64; + return true; +} + +bool +X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned, + unsigned, + bool *Fast) const { + if (Fast) { + switch (VT.getSizeInBits()) { + default: + // 8-byte and under are always assumed to be fast. + *Fast = true; + break; + case 128: + *Fast = !Subtarget->isUnalignedMem16Slow(); + break; + case 256: + *Fast = !Subtarget->isUnalignedMem32Slow(); + break; + // TODO: What about AVX-512 (512-bit) accesses? + } + } + // Misaligned accesses of any size are always allowed. + return true; +} + +/// Return the entry encoding for a jump table in the +/// current function. The returned value is a member of the +/// MachineJumpTableInfo::JTEntryKind enum. +unsigned X86TargetLowering::getJumpTableEncoding() const { + // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF + // symbol. + if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && + Subtarget->isPICStyleGOT()) + return MachineJumpTableInfo::EK_Custom32; + + // Otherwise, use the normal jump table encoding heuristics. + return TargetLowering::getJumpTableEncoding(); +} + +bool X86TargetLowering::useSoftFloat() const { + return Subtarget->useSoftFloat(); +} + +const MCExpr * +X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, + unsigned uid,MCContext &Ctx) const{ + assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && + Subtarget->isPICStyleGOT()); + // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF + // entries. + return MCSymbolRefExpr::create(MBB->getSymbol(), + MCSymbolRefExpr::VK_GOTOFF, Ctx); +} + +/// Returns relocation base for the given PIC jumptable. +SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, + SelectionDAG &DAG) const { + if (!Subtarget->is64Bit()) + // This doesn't have SDLoc associated with it, but is not really the + // same as a Register. 
+    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+                       getPointerTy(DAG.getDataLayout()));
+  return Table;
+}
+
+/// This returns the relocation base for the given PIC jumptable,
+/// the same as getPICJumpTableRelocBase, but as an MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+                             MCContext &Ctx) const {
+  // X86-64 uses RIP relative addressing based on the jump table label.
+  if (Subtarget->isPICStyleRIPRel())
+    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+  // Otherwise, the reference is relative to the PIC base.
+  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+}
+
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
+  const TargetRegisterClass *RRC = nullptr;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(TRI, VT);
+  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+    RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
+    break;
+  case MVT::x86mmx:
+    RRC = &X86::VR64RegClass;
+    break;
+  case MVT::f32: case MVT::f64:
+  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64:
+  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
+  case MVT::v4f64:
+    RRC = &X86::VR128RegClass;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
+                                               unsigned &Offset) const {
+  if (!Subtarget->isTargetLinux())
+    return false;
+
+  if (Subtarget->is64Bit()) {
+    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
+    Offset = 0x28;
+    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+      AddressSpace = 256;
+    else
+      AddressSpace = 257;
+  } else {
+    // %gs:0x14 on i386
+    Offset = 0x14;
+    AddressSpace = 256;
+  }
+  return true;
+}
+
+Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+  if (!Subtarget->isTargetAndroid())
+    return TargetLowering::getSafeStackPointerLocation(IRB);
+
+  // Android provides a fixed TLS slot for the SafeStack pointer. See the
+  // definition of TLS_SLOT_SAFESTACK in
+  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+  unsigned AddressSpace, Offset;
+  if (Subtarget->is64Bit()) {
+    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+    Offset = 0x48;
+    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+      AddressSpace = 256;
+    else
+      AddressSpace = 257;
+  } else {
+    // %gs:0x24 on i386
+    Offset = 0x24;
+    AddressSpace = 256;
+  }
+
+  return ConstantExpr::getIntToPtr(
+      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
+
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  assert(SrcAS != DestAS && "Expected different address spaces!");
+
+  return SrcAS < 256 && DestAS < 256;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+bool X86TargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_X86);
+}
+
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
+  return ScratchRegs;
+}
+
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               SDLoc dl, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
+    report_fatal_error("X86 interrupts may not return any value");
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+  SDValue Flag;
+  SmallVector<SDValue, 6> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  // Operand #1 = Bytes To Pop
+  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
+                                         MVT::i16));
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue ValToCopy = OutVals[i];
+    EVT ValVT = ValToCopy.getValueType();
+
+    // Promote values to the appropriate types.
+ if (VA.getLocInfo() == CCValAssign::SExt) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::AExt) { + if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); + } + else if (VA.getLocInfo() == CCValAssign::BCvt) + ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); + + assert(VA.getLocInfo() != CCValAssign::FPExt && + "Unexpected FP-extend for return value."); + + // If this is x86-64, and we disabled SSE, we can't return FP values, + // or SSE or MMX vectors. + if ((ValVT == MVT::f32 || ValVT == MVT::f64 || + VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && + (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. + if (ValVT == MVT::f64 && + (Subtarget->is64Bit() && !Subtarget->hasSSE2())) + report_fatal_error("SSE2 register return with SSE2 disabled"); + + // Returns in ST0/ST1 are handled specially: these are pushed as operands to + // the RET instruction and handled by the FP Stackifier. + if (VA.getLocReg() == X86::FP0 || + VA.getLocReg() == X86::FP1) { + // If this is a copy from an xmm register to ST(0), use an FPExtend to + // change the value to the FP stack register class. + if (isScalarFPTypeInSSEReg(VA.getValVT())) + ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); + RetOps.push_back(ValToCopy); + // Don't emit a copytoreg. + continue; + } + + // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 + // which is returned in RAX / RDX. + if (Subtarget->is64Bit()) { + if (ValVT == MVT::x86mmx) { + if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { + ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); + ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + ValToCopy); + // If we don't have SSE2 available, convert to v4f32 so the generated + // register is legal. + if (!Subtarget->hasSSE2()) + ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); + } + } + } + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // All x86 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. + // + // Checking Function.hasStructRetAttr() here is insufficient because the IR + // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is + // false, then an sret argument may be implicitly inserted in the SelDAG. In + // either case FuncInfo->setSRetReturnReg() will have been called. + if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, + getPointerTy(MF.getDataLayout())); + + unsigned RetValReg + = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? 
+ X86::RAX : X86::EAX; + Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); + Flag = Chain.getValue(1); + + // RAX/EAX now acts like a return value. + RetOps.push_back( + DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); + } + + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. + if (Flag.getNode()) + RetOps.push_back(Flag); + + X86ISD::NodeType opcode = X86ISD::RET_FLAG; + if (CallConv == CallingConv::X86_INTR) + opcode = X86ISD::IRET; + return DAG.getNode(opcode, dl, MVT::Other, RetOps); +} + +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. + if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != X86ISD::RET_FLAG) + return false; + // If we are returning more than one value, we can definitely + // not make a tail call see PR19530 + if (UI->getNumOperands() > 4) + return false; + if (UI->getNumOperands() == 4 && + UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) + return false; + HasRet = true; + } + + if (!HasRet) + return false; + + Chain = TCChain; + return true; +} + +EVT +X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const { + MVT ReturnMVT; + // TODO: Is this also valid on 32-bit? + if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) + ReturnMVT = MVT::i8; + else + ReturnMVT = MVT::i32; + + EVT MinVT = getRegisterType(Context, ReturnMVT); + return VT.bitsLT(MinVT) ? MinVT : VT; +} + +/// Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +/// +SDValue +X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + bool Is64Bit = Subtarget->is64Bit(); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + EVT CopyVT = VA.getLocVT(); + + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. 
+ bool RoundAfterCopy = false; + if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) { + CopyVT = MVT::f80; + RoundAfterCopy = (CopyVT != VA.getLocVT()); + } + + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), + CopyVT, InFlag).getValue(1); + SDValue Val = Chain.getValue(0); + + if (RoundAfterCopy) + Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, + // This truncation won't change the value. + DAG.getIntPtrConstant(1, dl)); + + if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1) + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + + InFlag = Chain.getValue(2); + InVals.push_back(Val); + } + + return Chain; +} + +//===----------------------------------------------------------------------===// +// C & StdCall & Fast Calling Convention implementation +//===----------------------------------------------------------------------===// +// StdCall calling convention seems to be standard for many Windows' API +// routines and around. It differs from C calling convention just a little: +// callee should clean up the stack, not caller. Symbols should be also +// decorated in some fancy way :) It doesn't support any vector arguments. +// For info on fast calling convention see Fast Calling Convention (tail call) +// implementation LowerX86_32FastCCCallTo. + +/// CallIsStructReturn - Determines whether a call uses struct return +/// semantics. +enum StructReturnType { + NotStructReturn, + RegStructReturn, + StackStructReturn +}; +static StructReturnType +callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) { + if (Outs.empty()) + return NotStructReturn; + + const ISD::ArgFlagsTy &Flags = Outs[0].Flags; + if (!Flags.isSRet()) + return NotStructReturn; + if (Flags.isInReg() || IsMCU) + return RegStructReturn; + return StackStructReturn; +} + +/// Determines whether a function uses struct return semantics. +static StructReturnType +argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) { + if (Ins.empty()) + return NotStructReturn; + + const ISD::ArgFlagsTy &Flags = Ins[0].Flags; + if (!Flags.isSRet()) + return NotStructReturn; + if (Flags.isInReg() || IsMCU) + return RegStructReturn; + return StackStructReturn; +} + +/// Make a copy of an aggregate at address specified by "Src" to address +/// "Dst" with size and alignment information specified by the specific +/// parameter attribute. The copy will be passed as a byval function parameter. +static SDValue +CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, + ISD::ArgFlagsTy Flags, SelectionDAG &DAG, + SDLoc dl) { + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); + + return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), + /*isVolatile*/false, /*AlwaysInline=*/true, + /*isTailCall*/false, + MachinePointerInfo(), MachinePointerInfo()); +} + +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { + return (CC == CallingConv::Fast || CC == CallingConv::GHC || + CC == CallingConv::HiPE || CC == CallingConv::HHVM); +} + +/// Return true if we might ever do TCO for calls with this calling convention. 
+static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + // C calling conventions: + case CallingConv::C: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + // Callee pop conventions: + case CallingConv::X86_ThisCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_VectorCall: + case CallingConv::X86_FastCall: + return true; + default: + return canGuaranteeTCO(CC); + } +} + +/// Return true if the function is being made into a tailcall target by +/// changing its ABI. +static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { + return GuaranteedTailCallOpt && canGuaranteeTCO(CC); +} + +bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + auto Attr = + CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); + if (!CI->isTailCall() || Attr.getValueAsString() == "true") + return false; + + CallSite CS(CI); + CallingConv::ID CalleeCC = CS.getCallingConv(); + if (!mayTailCallThisCC(CalleeCC)) + return false; + + return true; +} + +SDValue +X86TargetLowering::LowerMemArgument(SDValue Chain, + CallingConv::ID CallConv, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + MachineFrameInfo *MFI, + unsigned i) const { + // Create the nodes corresponding to a load from this parameter slot. + ISD::ArgFlagsTy Flags = Ins[i].Flags; + bool AlwaysUseMutable = shouldGuaranteeTCO( + CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); + bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); + EVT ValVT; + + // If value is passed by pointer we have address passed instead of the value + // itself. + bool ExtendedInMem = VA.isExtInLoc() && + VA.getValVT().getScalarType() == MVT::i1; + + if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) + ValVT = VA.getLocVT(); + else + ValVT = VA.getValVT(); + + // Calculate SP offset of interrupt parameter, re-arrange the slot normally + // taken by a return address. + int Offset = 0; + if (CallConv == CallingConv::X86_INTR) { + const X86Subtarget& Subtarget = + static_cast<const X86Subtarget&>(DAG.getSubtarget()); + // X86 interrupts may take one or two arguments. + // On the stack there will be no return address as in regular call. + // Offset of last argument need to be set to -4/-8 bytes. + // Where offset of the first argument out of two, should be set to 0 bytes. + Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + } + + // FIXME: For now, all byval parameter objects are marked mutable. This can be + // changed with more analysis. + // In case of tail call optimization mark all arguments mutable. Since they + // could be overwritten by lowering of arguments in case of a tail call. + if (Flags.isByVal()) { + unsigned Bytes = Flags.getByValSize(); + if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. + int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } + return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + } else { + int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, + VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. 
+ if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } + + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); + return ExtendedInMem ? + DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; + } +} + +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + + if (Subtarget->isCallingConvWin64(CallConv)) { + static const MCPhysReg GPR64ArgRegsWin64[] = { + X86::RCX, X86::RDX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); + } + + static const MCPhysReg GPR64ArgRegs64Bit[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); +} + +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, + CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + if (Subtarget->isCallingConvWin64(CallConv)) { + // The XMM registers which might contain var arg parameters are shadowed + // in their paired GPR. So we only need to save the GPR to their home + // slots. + // TODO: __vectorcall will change this. + return None; + } + + const Function *Fn = MF.getFunction(); + bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); + bool isSoftFloat = Subtarget->useSoftFloat(); + assert(!(isSoftFloat && NoImplicitFloatOps) && + "SSE register cannot be used when SSE is disabled!"); + if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) + // Kernel mode asks for SSE to be disabled, so there are no XMM argument + // registers. + return None; + + static const MCPhysReg XMMArgRegs64Bit[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); +} + +SDValue X86TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + + const Function* Fn = MF.getFunction(); + if (Fn->hasExternalLinkage() && + Subtarget->isTargetCygMing() && + Fn->getName() == "main") + FuncInfo->setForceFramePointer(true); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); + + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && + "Var args not supported with calling convention fastcc, ghc or hipe"); + + if (CallConv == CallingConv::X86_INTR) { + bool isLegal = Ins.size() == 1 || + (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || + (!Is64Bit && Ins[1].VT == MVT::i32))); + if (!isLegal) + report_fatal_error("X86 interrupts may take one or two arguments"); + } + + // Assign locations to all of the incoming arguments. 
+ SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeFormalArguments(Ins, CC_X86); + + unsigned LastVal = ~0U; + SDValue ArgValue; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + // TODO: If an arg is passed in two places (e.g. reg and stack), skip later + // places. + assert(VA.getValNo() != LastVal && + "Don't support value assigned to multiple locs yet"); + (void)LastVal; + LastVal = VA.getValNo(); + + if (VA.isRegLoc()) { + EVT RegVT = VA.getLocVT(); + const TargetRegisterClass *RC; + if (RegVT == MVT::i32) + RC = &X86::GR32RegClass; + else if (Is64Bit && RegVT == MVT::i64) + RC = &X86::GR64RegClass; + else if (RegVT == MVT::f32) + RC = &X86::FR32RegClass; + else if (RegVT == MVT::f64) + RC = &X86::FR64RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; + else if (RegVT.is512BitVector()) + RC = &X86::VR512RegClass; + else if (RegVT.is256BitVector()) + RC = &X86::VR256RegClass; + else if (RegVT.is128BitVector()) + RC = &X86::VR128RegClass; + else if (RegVT == MVT::x86mmx) + RC = &X86::VR64RegClass; + else if (RegVT == MVT::i1) + RC = &X86::VK1RegClass; + else if (RegVT == MVT::v8i1) + RC = &X86::VK8RegClass; + else if (RegVT == MVT::v16i1) + RC = &X86::VK16RegClass; + else if (RegVT == MVT::v32i1) + RC = &X86::VK32RegClass; + else if (RegVT == MVT::v64i1) + RC = &X86::VK64RegClass; + else + llvm_unreachable("Unknown argument type!"); + + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + + // If this is an 8 or 16-bit value, it is really passed promoted to 32 + // bits. Insert an assert[sz]ext to capture this, then truncate to the + // right size. + if (VA.getLocInfo() == CCValAssign::SExt) + ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::BCvt) + ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); + + if (VA.isExtInLoc()) { + // Handle MMX values passed in XMM regs. + if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) + ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); + else + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + } + } else { + assert(VA.isMemLoc()); + ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); + } + + // If value is passed via pointer - do a load. + if (VA.getLocInfo() == CCValAssign::Indirect) + ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, + MachinePointerInfo(), false, false, false, 0); + + InVals.push_back(ArgValue); + } + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // All x86 ABIs require that for returning structs by value we copy the + // sret argument into %rax/%eax (depending on ABI) for the return. Save + // the argument into a virtual register so that we can access it from the + // return points. 
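+    // For illustration: when a struct is returned in memory, the front end
+    // adds a hidden 'sret' pointer argument; the code below stashes that
+    // pointer in SRetReturnReg so the return lowering can copy it back into
+    // %rax (64-bit) or %eax (32-bit) before returning.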
+ if (Ins[i].Flags.isSRet()) { + unsigned Reg = FuncInfo->getSRetReturnReg(); + if (!Reg) { + MVT PtrTy = getPointerTy(DAG.getDataLayout()); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + FuncInfo->setSRetReturnReg(Reg); + } + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); + break; + } + } + + unsigned StackSize = CCInfo.getNextStackOffset(); + // Align stack specially for tail calls. + if (shouldGuaranteeTCO(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) + StackSize = GetAlignedArgumentStackSize(StackSize, DAG); + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. We + // can skip this if there are no va_start calls. + if (MFI->hasVAStart() && + (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall))) { + FuncInfo->setVarArgsFrameIndex( + MFI->CreateFixedObject(1, StackSize, true)); + } + + // Figure out if XMM registers are in use. + assert(!(Subtarget->useSoftFloat() && + Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && + "SSE register cannot be used when SSE is disabled!"); + + // 64-bit calling conventions support varargs and register parameters, so we + // have to do extra work to spill them in the prologue. + if (Is64Bit && isVarArg && MFI->hasVAStart()) { + // Find the first unallocated argument registers. + ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); + ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); + unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); + assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && + "SSE register cannot be used when SSE is disabled!"); + + // Gather all the live in physical registers. + SmallVector<SDValue, 6> LiveGPRs; + SmallVector<SDValue, 8> LiveXMMRegs; + SDValue ALVal; + for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { + unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); + LiveGPRs.push_back( + DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); + } + if (!ArgXMMs.empty()) { + unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); + for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { + unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); + LiveXMMRegs.push_back( + DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); + } + } + + if (IsWin64) { + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by deferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); + } + + // Store the integer parameter registers. 
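+    // For illustration, on SysV x86-64 the register save area is
+    // 6*8 + 8*16 = 176 bytes.  If a varargs function consumes two GPRs and
+    // one XMM register for its named parameters, the unnamed ones start at
+    // gp_offset = 16 and fp_offset = 48 + 16 = 64, matching the
+    // gp_offset/fp_offset bookkeeping that va_arg performs on the va_list.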
+ SmallVector<SDValue, 8> MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy(DAG.getDataLayout())); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + RSFIN, DAG.getIntPtrConstant(Offset, dl)); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); + MemOps.push_back(Store); + Offset += 8; + } + + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { + // Now store the XMM (fp + vector) parameter registers. + SmallVector<SDValue, 12> SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getRegSaveFrameIndex(), dl)); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getVarArgsFPOffset(), dl)); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + } + + if (isVarArg && MFI->hasMustTailInVarArgFunc()) { + // Find the largest legal vector type. + MVT VecVT = MVT::Other; + // FIXME: Only some x86_32 calling conventions support AVX512. + if (Subtarget->hasAVX512() && + (Is64Bit || (CallConv == CallingConv::X86_VectorCall || + CallConv == CallingConv::Intel_OCL_BI))) + VecVT = MVT::v16f32; + else if (Subtarget->hasAVX()) + VecVT = MVT::v8f32; + else if (Subtarget->hasSSE2()) + VecVT = MVT::v4f32; + + // We forward some GPRs and some vector types. + SmallVector<MVT, 2> RegParmTypes; + MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; + RegParmTypes.push_back(IntVT); + if (VecVT != MVT::Other) + RegParmTypes.push_back(VecVT); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVectorImpl<ForwardedRegister> &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + + // Conservatively forward AL on x86_64, since it might be used for varargs. + if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { + unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); + } + + // Copy all forwards from physical to virtual registers. + for (ForwardedRegister &F : Forwards) { + // FIXME: Can we use a less constrained schedule? + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); + Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); + } + } + + // Some CCs need callee pop. + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt)) { + FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. + } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { + // X86 interrupts must pop the error code if present + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); + } else { + FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. + // If this is an sret function, the return should pop the hidden pointer. 
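+    // For illustration: on 32-bit Darwin/Linux/MinGW a function returning a
+    // struct through a stack-passed sret pointer ends in 'ret $4', discarding
+    // the hidden pointer its caller pushed; MSVC-based targets leave that to
+    // the caller instead, hence the isOSMSVCRT() exclusion below.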
+ if (!Is64Bit && !canGuaranteeTCO(CallConv) && + !Subtarget->getTargetTriple().isOSMSVCRT() && + argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn) + FuncInfo->setBytesToPopOnReturn(4); + } + + if (!Is64Bit) { + // RegSaveFrameIndex is X86-64 only. + FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + if (CallConv == CallingConv::X86_FastCall || + CallConv == CallingConv::X86_ThisCall) + // fastcc functions can't have varargs. + FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); + } + + FuncInfo->setArgumentStackSize(StackSize); + + if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { + EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + assert(Is64Bit); + // TODO: Add a mechanism to frame lowering that will allow us to indicate + // that we'd prefer this slot be allocated towards the bottom of the frame + // (i.e. near the stack pointer after allocating the frame). Every + // funclet needs a copy of this slot in its (mostly empty) frame, and the + // offset from the bottom of this and each funclet's frame must be the + // same, so the size of funclets' (mostly empty) frames is dictated by + // how far this slot is from the bottom (since they allocate just enough + // space to accomodate holding this slot at the correct offset). + int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); + EHInfo->PSPSymFrameIdx = PSPSymFI; + } + } + + return Chain; +} + +SDValue +X86TargetLowering::LowerMemOpCallTo(SDValue Chain, + SDValue StackPtr, SDValue Arg, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); + if (Flags.isByVal()) + return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); + + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); +} + +/// Emit a load of return address if tail call +/// optimization is performed and it is required. +SDValue +X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, + SDValue &OutRetAddr, SDValue Chain, + bool IsTailCall, bool Is64Bit, + int FPDiff, SDLoc dl) const { + // Adjust the Return address stack slot. + EVT VT = getPointerTy(DAG.getDataLayout()); + OutRetAddr = getReturnAddressFrameIndex(DAG); + + // Load the "old" Return address. + OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), + false, false, false, 0); + return SDValue(OutRetAddr.getNode(), 1); +} + +/// Emit a store of the return address if tail call +/// optimization is performed and it is required (FPDiff!=0). +static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, + SDValue Chain, SDValue RetAddrFrIdx, + EVT PtrVT, unsigned SlotSize, + int FPDiff, SDLoc dl) { + // Store the return address to the appropriate stack slot. + if (!FPDiff) return Chain; + // Calculate the new stack slot for the return address. 
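+  // For illustration: with an 8-byte slot and FPDiff = -16 (the callee needs
+  // 16 more bytes of argument space than the caller provided), the return
+  // address is re-stored at fixed offset -16 - 8 = -24, i.e. 16 bytes below
+  // its usual slot at -8 (see getReturnAddressFrameIndex).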
+ int NewReturnAddrFI = + MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, + false); + SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); + Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), NewReturnAddrFI), + false, false, 0); + return Chain; +} + +/// Returns a vector_shuffle mask for an movs{s|d}, movd +/// operation of specified width. +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) + Mask.push_back(i); + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + +SDValue +X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + CallingConv::ID CallConv = CLI.CallConv; + bool &isTailCall = CLI.IsTailCall; + bool isVarArg = CLI.IsVarArg; + + MachineFunction &MF = DAG.getMachineFunction(); + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); + StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU()); + bool IsSibcall = false; + X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); + auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + + if (CallConv == CallingConv::X86_INTR) + report_fatal_error("X86 interrupts may not be called directly"); + + if (Attr.getValueAsString() == "true") + isTailCall = false; + + if (Subtarget->isPICStyleGOT() && + !MF.getTarget().Options.GuaranteedTailCallOpt) { + // If we are using a GOT, disable tail calls to external symbols with + // default visibility. Tail calling such a symbol requires using a GOT + // relocation, which forces early binding of the symbol. This breaks code + // that require lazy function symbol resolution. Using musttail or + // GuaranteedTailCallOpt will override this. + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if (!G || (!G->getGlobal()->hasLocalLinkage() && + G->getGlobal()->hasDefaultVisibility())) + isTailCall = false; + } + + bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); + if (IsMustTail) { + // Force this to be a tail call. The verifier rules are enough to ensure + // that we can lower this successfully without moving the return address + // around. + isTailCall = true; + } else if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, SR != NotStructReturn, + MF.getFunction()->hasStructRetAttr(), CLI.RetTy, + Outs, OutVals, Ins, DAG); + + // Sibcalls are automatically detected tailcalls which do not require + // ABI changes. + if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) + IsSibcall = true; + + if (isTailCall) + ++NumTailCalls; + } + + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && + "Var args not supported with calling convention fastcc, ghc or hipe"); + + // Analyze operands of the call, assigning locations to each operand. 
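+  // The Win64 shadow (home) area handled below reserves the first
+  // 4 * 8 = 32 bytes of the outgoing argument space so the callee may spill
+  // RCX, RDX, R8 and R9; marking those bytes allocated up front keeps
+  // AnalyzeCallOperands from assigning stack arguments to them.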
+ SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); + if (IsSibcall) + // This is a sibcall. The memory operands are available in caller's + // own caller's stack. + NumBytes = 0; + else if (MF.getTarget().Options.GuaranteedTailCallOpt && + canGuaranteeTCO(CallConv)) + NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); + + int FPDiff = 0; + if (isTailCall && !IsSibcall && !IsMustTail) { + // Lower arguments at fp - stackoffset + fpdiff. + unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); + + FPDiff = NumBytesCallerPushed - NumBytes; + + // Set the delta of movement of the returnaddr stackslot. + // But only set if delta is greater than previous delta. + if (FPDiff < X86Info->getTCReturnAddrDelta()) + X86Info->setTCReturnAddrDelta(FPDiff); + } + + unsigned NumBytesToPush = NumBytes; + unsigned NumBytesToPop = NumBytes; + + // If we have an inalloca argument, all stack space has already been allocated + // for us and be right at the top of the stack. We don't support multiple + // arguments passed in memory when using inalloca. + if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { + NumBytesToPush = 0; + if (!ArgLocs.back().isMemLoc()) + report_fatal_error("cannot use inalloca attribute on a register " + "parameter"); + if (ArgLocs.back().getLocMemOffset() != 0) + report_fatal_error("any parameter with the inalloca attribute must be " + "the only memory argument"); + } + + if (!IsSibcall) + Chain = DAG.getCALLSEQ_START( + Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); + + SDValue RetAddrFrIdx; + // Load return address for tail calls. + if (isTailCall && FPDiff) + Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, + Is64Bit, FPDiff, dl); + + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + SDValue StackPtr; + + // Walk the register/memloc assignments, inserting copies/loads. In the case + // of tail call optimization arguments are handle later. + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // Skip inalloca arguments, they have already been written. + ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (Flags.isInAlloca()) + continue; + + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = OutVals[i]; + bool isByVal = Flags.isByVal(); + + // Promote the value if needed. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::AExt: + if (Arg.getValueType().isVector() && + Arg.getValueType().getVectorElementType() == MVT::i1) + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); + else if (RegVT.is128BitVector()) { + // Special case: passing MMX values in XMM registers. 
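+        // The x86mmx value is reinterpreted as i64, widened to v2i64, and
+        // then merged into lane 0 of an undef vector with a MOVQ/MOVSD-style
+        // shuffle (getMOVL), so only the low 64 bits of the XMM register are
+        // defined.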
+ Arg = DAG.getBitcast(MVT::i64, Arg); + Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); + Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); + } else + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getBitcast(RegVT, Arg); + break; + case CCValAssign::Indirect: { + // Store the argument. + SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); + int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); + Chain = DAG.getStore( + Chain, dl, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0); + Arg = SpillSlot; + break; + } + } + + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (isVarArg && IsWin64) { + // Win64 ABI requires argument XMM reg to be copied to the corresponding + // shadow reg if callee is a varargs function. + unsigned ShadowReg = 0; + switch (VA.getLocReg()) { + case X86::XMM0: ShadowReg = X86::RCX; break; + case X86::XMM1: ShadowReg = X86::RDX; break; + case X86::XMM2: ShadowReg = X86::R8; break; + case X86::XMM3: ShadowReg = X86::R9; break; + } + if (ShadowReg) + RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); + } + } else if (!IsSibcall && (!isTailCall || isByVal)) { + assert(VA.isMemLoc()); + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + getPointerTy(DAG.getDataLayout())); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + + if (Subtarget->isPICStyleGOT()) { + // ELF / PIC requires GOT in the EBX register before function calls via PLT + // GOT pointer. + if (!isTailCall) { + RegsToPass.push_back(std::make_pair( + unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), + getPointerTy(DAG.getDataLayout())))); + } else { + // If we are tail calling and generating PIC/GOT style code load the + // address of the callee into ECX. The value in ecx is used as target of + // the tail jump. This is done to circumvent the ebx/callee-saved problem + // for tail calls on PIC/GOT architectures. Normally we would just put the + // address of GOT into ebx and then call target@PLT. But for tail calls + // ebx would be restored (since ebx is callee saved) before jumping to the + // target@PLT. + + // Note: The actual moving to ECX is done further down. + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if (G && !G->getGlobal()->hasLocalLinkage() && + G->getGlobal()->hasDefaultVisibility()) + Callee = LowerGlobalAddress(Callee, DAG); + else if (isa<ExternalSymbolSDNode>(Callee)) + Callee = LowerExternalSymbol(Callee, DAG); + } + } + + if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { + // From AMD64 ABI document: + // For calls that may call functions that use varargs or stdargs + // (prototype-less calls or calls to functions containing ellipsis (...) in + // the declaration) %al is used as hidden argument to specify the number + // of SSE registers used. The contents of %al do not need to match exactly + // the number of registers, but must be an ubound on the number of SSE + // registers used and is in the range 0 - 8 inclusive. + + // Count the number of XMM registers allocated. 
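+    // For illustration: a call such as printf("%f %f", x, y) passes the two
+    // doubles in XMM0 and XMM1, so getFirstUnallocated returns 2 and AL is
+    // loaded with 2 before the call; passing no FP/vector arguments leaves
+    // AL at 0, and 8 is the maximum the ABI allows.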
+ static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); + assert((Subtarget->hasSSE1() || !NumXMMRegs) + && "SSE registers cannot be used when SSE is disabled"); + + RegsToPass.push_back(std::make_pair(unsigned(X86::AL), + DAG.getConstant(NumXMMRegs, dl, + MVT::i8))); + } + + if (isVarArg && IsMustTail) { + const auto &Forwards = X86Info->getForwardedMustTailRegParms(); + for (const auto &F : Forwards) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + } + } + + // For tail calls lower the arguments to the 'real' stack slots. Sibcalls + // don't need this because the eligibility check rejects calls that require + // shuffling arguments passed in memory. + if (!IsSibcall && isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); + + SmallVector<SDValue, 8> MemOpChains2; + SDValue FIN; + int FI = 0; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isRegLoc()) + continue; + assert(VA.isMemLoc()); + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + // Skip inalloca arguments. They don't require any work. + if (Flags.isInAlloca()) + continue; + // Create frame index. + int32_t Offset = VA.getLocMemOffset()+FPDiff; + uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; + FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + + if (Flags.isByVal()) { + // Copy relative to framepointer. + SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + getPointerTy(DAG.getDataLayout())); + Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, Source); + + MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, + ArgChain, + Flags, DAG, dl)); + } else { + // Store relative to framepointer. + MemOpChains2.push_back(DAG.getStore( + ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0)); + } + } + + if (!MemOpChains2.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); + + // Store the return address to the appropriate stack slot. + Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, + getPointerTy(DAG.getDataLayout()), + RegInfo->getSlotSize(), FPDiff, dl); + } + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into registers. 
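+  // Threading InFlag (a glue value) from one CopyToReg to the next and into
+  // the call node keeps these copies scheduled immediately before the call,
+  // which helps ensure the argument registers are not clobbered in between.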
+ SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + if (DAG.getTarget().getCodeModel() == CodeModel::Large) { + assert(Is64Bit && "Large code model is only legal in 64-bit mode."); + // In the 64-bit large code model, we have to make all calls + // through a register, since the call instruction's 32-bit + // pc-relative offset may not be large enough to hold the whole + // address. + } else if (Callee->getOpcode() == ISD::GlobalAddress) { + // If the callee is a GlobalAddress node (quite common, every direct call + // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack + // it. + GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); + + // We should use extra load for direct calls to dllimported functions in + // non-JIT mode. + const GlobalValue *GV = G->getGlobal(); + if (!GV->hasDLLImportStorageClass()) { + unsigned char OpFlags = 0; + bool ExtraLoad = false; + unsigned WrapperKind = ISD::DELETED_NODE; + + // On ELF targets, in both X86-64 and X86-32 mode, direct calls to + // external symbols most go through the PLT in PIC mode. If the symbol + // has hidden or protected visibility, or if it is static or local, then + // we don't need to use the PLT - we can directly call it. + if (Subtarget->isTargetELF() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_ && + GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + !GV->isStrongDefinitionForLinker() && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) && + cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) { + // If the function is marked as non-lazy, generate an indirect call + // which loads from the GOT directly. This avoids runtime overhead + // at the cost of eager binding (and one extra byte of encoding). + OpFlags = X86II::MO_GOTPCREL; + WrapperKind = X86ISD::WrapperRIP; + ExtraLoad = true; + } + + Callee = DAG.getTargetGlobalAddress( + GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); + + // Add a wrapper if needed. + if (WrapperKind != ISD::DELETED_NODE) + Callee = DAG.getNode(X86ISD::WrapperRIP, dl, + getPointerTy(DAG.getDataLayout()), Callee); + // Add extra indirection if needed. + if (ExtraLoad) + Callee = DAG.getLoad( + getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, + false, 0); + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + unsigned char OpFlags = 0; + + // On ELF targets, in either X86-64 or X86-32 mode, direct calls to + // external symbols should go through the PLT. 
+ if (Subtarget->isTargetELF() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } + + Callee = DAG.getTargetExternalSymbol( + S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); + } else if (Subtarget->isTarget64BitILP32() && + Callee->getValueType(0) == MVT::i32) { + // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI + Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); + } + + // Returns a chain & a flag for retval copy to use. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SmallVector<SDValue, 8> Ops; + + if (!IsSibcall && isTailCall) { + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getIntPtrConstant(NumBytesToPop, dl, true), + DAG.getIntPtrConstant(0, dl, true), InFlag, dl); + InFlag = Chain.getValue(1); + } + + Ops.push_back(Chain); + Ops.push_back(Callee); + + if (isTailCall) + Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + // Add a register mask operand representing the call-preserved registers. + const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + + // If this is an invoke in a 32-bit function using a funclet-based + // personality, assume the function clobbers all registers. If an exception + // is thrown, the runtime will not restore CSRs. + // FIXME: Model this more precisely so that we can register allocate across + // the normal edge and spill and fill across the exceptional edge. + if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { + const Function *CallerFn = MF.getFunction(); + EHPersonality Pers = + CallerFn->hasPersonalityFn() + ? classifyEHPersonality(CallerFn->getPersonalityFn()) + : EHPersonality::Unknown; + if (isFuncletEHPersonality(Pers)) + Mask = RegInfo->getNoPreservedMask(); + } + + Ops.push_back(DAG.getRegisterMask(Mask)); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + if (isTailCall) { + // We used to do: + //// If this is the first return lowered for this function, add the regs + //// to the liveout set for the function. + // This isn't right, although it's probably harmless on x86; liveouts + // should be computed from returns not tail calls. Consider a void + // function making a tail call to a function returning int. + MF.getFrameInfo()->setHasTailCall(); + return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); + } + + Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); + InFlag = Chain.getValue(1); + + // Create the CALLSEQ_END node. 
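+  // For illustration: a 32-bit stdcall callee taking 12 bytes of arguments
+  // returns with 'ret $12', so NumBytesForCalleeToPop is the full NumBytes;
+  // a cdecl callee pops nothing, and the sret-on-stack case below accounts
+  // for the 4-byte hidden pointer popped by the callee.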
+ unsigned NumBytesForCalleeToPop; + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + DAG.getTarget().Options.GuaranteedTailCallOpt)) + NumBytesForCalleeToPop = NumBytes; // Callee pops everything + else if (!Is64Bit && !canGuaranteeTCO(CallConv) && + !Subtarget->getTargetTriple().isOSMSVCRT() && + SR == StackStructReturn) + // If this is a call to a struct-return function, the callee + // pops the hidden struct pointer, so we have to push it back. + // This is common for Darwin/X86, Linux & Mingw32 targets. + // For MSVC Win32 targets, the caller pops the hidden struct pointer. + NumBytesForCalleeToPop = 4; + else + NumBytesForCalleeToPop = 0; // Callee pops nothing. + + // Returns a flag for retval copy to use. + if (!IsSibcall) { + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getIntPtrConstant(NumBytesToPop, dl, true), + DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, + true), + InFlag, dl); + InFlag = Chain.getValue(1); + } + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, + Ins, dl, DAG, InVals); +} + +//===----------------------------------------------------------------------===// +// Fast Calling Convention (tail call) implementation +//===----------------------------------------------------------------------===// + +// Like std call, callee cleans arguments, convention except that ECX is +// reserved for storing the tail called function address. Only 2 registers are +// free for argument passing (inreg). Tail call optimization is performed +// provided: +// * tailcallopt is enabled +// * caller/callee are fastcc +// On X86_64 architecture with GOT-style position independent code only local +// (within module) calls are supported at the moment. +// To keep the stack aligned according to platform abi the function +// GetAlignedArgumentStackSize ensures that argument delta is always multiples +// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) +// If a tail called function callee has more arguments than the caller the +// caller needs to make sure that there is room to move the RETADDR to. This is +// achieved by reserving an area the size of the argument delta right after the +// original RETADDR, but before the saved framepointer or the spilled registers +// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) +// stack layout: +// arg1 +// arg2 +// RETADDR +// [ new RETADDR +// move area ] +// (possible EBP) +// ESI +// EDI +// local1 .. + +/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align +/// requirement. +unsigned +X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, + SelectionDAG& DAG) const { + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + unsigned StackAlignment = TFI.getStackAlignment(); + uint64_t AlignMask = StackAlignment - 1; + int64_t Offset = StackSize; + unsigned SlotSize = RegInfo->getSlotSize(); + if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { + // Number smaller than 12 so just add the difference. + Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); + } else { + // Mask out lower bits, add stackalignment once plus the 12 bytes. + Offset = ((~AlignMask) & Offset) + StackAlignment + + (StackAlignment-SlotSize); + } + return Offset; +} + +/// Return true if the given stack call argument is already available in the +/// same position (relatively) of the caller's incoming argument stack. 
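+/// When this holds for every stack argument, a sibcall can simply reuse the
+/// caller's incoming argument area instead of storing the values again; e.g.
+/// on 32-bit targets 'int f(int a, int b) { return g(a, b); }' can forward
+/// both fixed stack slots unchanged.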
+static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const X86InstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { + unsigned Opcode = Def->getOpcode(); + if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || + Opcode == X86::LEA64_32r) && + Def->getOperand(1).isFI()) { + FI = Def->getOperand(1).getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + } + } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { + FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); + FI = FINode->getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// Check whether the call is eligible for tail call optimization. Targets +/// that want to do tail call optimization should implement this function. +bool X86TargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + if (!mayTailCallThisCC(CalleeCC)) + return false; + + // If -tailcallopt is specified, make fastcc functions tail-callable. + MachineFunction &MF = DAG.getMachineFunction(); + const Function *CallerF = MF.getFunction(); + + // If the function return type is x86_fp80 and the callee return type is not, + // then the FP_EXTEND of the call result is not a nop. It's not safe to + // perform a tailcall optimization here. + if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) + return false; + + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); + bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); + + // Win64 functions have extra shadow space for argument homing. Don't do the + // sibcall if the caller and callee have mismatched expectations for this + // space. + if (IsCalleeWin64 != IsCallerWin64) + return false; + + if (DAG.getTarget().Options.GuaranteedTailCallOpt) { + if (canGuaranteeTCO(CalleeCC) && CCMatch) + return true; + return false; + } + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to + // emit a special epilogue. 
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + if (RegInfo->needsStackRealignment(MF)) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // Do not sibcall optimize vararg calls unless all arguments are passed via + // registers. + if (isVarArg && !Outs.empty()) { + // Optimizing for varargs on Win64 is unlikely to be safe without + // additional testing. + if (IsCalleeWin64 || IsCallerWin64) + return false; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + + // If the call result is in ST0 / ST1, it needs to be popped off the x87 + // stack. Therefore, if it's not used by the call it is not safe to optimize + // this into a sibcall. + bool Unused = false; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (!Ins[i].Used) { + Unused = true; + break; + } + } + if (Unused) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) + return false; + } + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, + *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); + + SmallVector<CCValAssign, 16> RVLocs2; + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, + *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + unsigned StackArgsSize = 0; + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsCalleeWin64) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + StackArgsSize = CCInfo.getNextStackOffset(); + + if (CCInfo.getNextStackOffset()) { + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. 
+ MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + + // If the tailcall address may be in a register, then make sure it's + // possible to register allocate for it. In 32-bit, the call address can + // only target EAX, EDX, or ECX since the tail call must be scheduled after + // callee-saved registers are restored. These happen to be the same + // registers used to pass 'inreg' arguments so watch out for those. + if (!Subtarget->is64Bit() && + ((!isa<GlobalAddressSDNode>(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) || + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { + unsigned NumInRegs = 0; + // In PIC we need an extra register to formulate the address computation + // for the callee. + unsigned MaxInRegs = + (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (!VA.isRegLoc()) + continue; + unsigned Reg = VA.getLocReg(); + switch (Reg) { + default: break; + case X86::EAX: case X86::EDX: case X86::ECX: + if (++NumInRegs == MaxInRegs) + return false; + break; + } + } + } + } + + bool CalleeWillPop = + X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt); + + if (unsigned BytesToPop = + MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { + // If we have bytes to pop, the callee must pop them. + bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; + if (!CalleePopMatches) + return false; + } else if (CalleeWillPop && StackArgsSize > 0) { + // If we don't have bytes to pop, make sure the callee doesn't pop any. 
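+    // e.g. a cdecl caller (which pops nothing itself) tail-calling a stdcall
+    // callee with stack arguments would leave the stack unbalanced after the
+    // callee's 'ret $N', so the sibcall is rejected.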
+ return false; + } + + return true; +} + +FastISel * +X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return X86::createFastISel(funcInfo, libInfo); +} + +//===----------------------------------------------------------------------===// +// Other Lowering Hooks +//===----------------------------------------------------------------------===// + +static bool MayFoldLoad(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); +} + +static bool MayFoldIntoStore(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); +} + +static bool isTargetShuffle(unsigned Opcode) { + switch(Opcode) { + default: return false; + case X86ISD::BLENDI: + case X86ISD::PSHUFB: + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: + case X86ISD::PALIGNR: + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + case X86ISD::VPERMILPI: + case X86ISD::VPERM2X128: + case X86ISD::VPERMI: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: + return true; + } +} + +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, + SDValue V1, unsigned TargetMask, + SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::VPERMILPI: + case X86ISD::VPERMI: + return DAG.getNode(Opc, dl, VT, V1, + DAG.getConstant(TargetMask, dl, MVT::i8)); + } +} + +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + return DAG.getNode(Opc, dl, VT, V1, V2); + } +} + +SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + int ReturnAddrIndex = FuncInfo->getRAIndex(); + + if (ReturnAddrIndex == 0) { + // Set up a frame object for the return address. + unsigned SlotSize = RegInfo->getSlotSize(); + ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, + -(int64_t)SlotSize, + false); + FuncInfo->setRAIndex(ReturnAddrIndex); + } + + return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); +} + +bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, + bool hasSymbolicDisplacement) { + // Offset should fit into 32 bit immediate field. + if (!isInt<32>(Offset)) + return false; + + // If we don't have a symbolic displacement - we don't have any extra + // restrictions. + if (!hasSymbolicDisplacement) + return true; + + // FIXME: Some tweaks might be needed for medium code model. + if (M != CodeModel::Small && M != CodeModel::Kernel) + return false; + + // For small code model we assume that latest object is 16MB before end of 31 + // bits boundary. 
We may also accept pretty large negative constants knowing + // that all objects are in the positive half of address space. + if (M == CodeModel::Small && Offset < 16*1024*1024) + return true; + + // For kernel code model we know that all object resist in the negative half + // of 32bits address space. We may not accept negative offsets, since they may + // be just off and we may accept pretty large positive ones. + if (M == CodeModel::Kernel && Offset >= 0) + return true; + + return false; +} + +/// Determines whether the callee is required to pop its own arguments. +/// Callee pop is necessary to support tail calls. +bool X86::isCalleePop(CallingConv::ID CallingConv, + bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { + // If GuaranteeTCO is true, we force some calls to be callee pop so that we + // can guarantee TCO. + if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) + return true; + + switch (CallingConv) { + default: + return false; + case CallingConv::X86_StdCall: + case CallingConv::X86_FastCall: + case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: + return !is64Bit; + } +} + +/// \brief Return true if the condition is an unsigned comparison operation. +static bool isX86CCUnsigned(unsigned X86CC) { + switch (X86CC) { + default: llvm_unreachable("Invalid integer condition!"); + case X86::COND_E: return true; + case X86::COND_G: return false; + case X86::COND_GE: return false; + case X86::COND_L: return false; + case X86::COND_LE: return false; + case X86::COND_NE: return true; + case X86::COND_B: return true; + case X86::COND_A: return true; + case X86::COND_BE: return true; + case X86::COND_AE: return true; + } +} + +static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { + switch (SetCCOpcode) { + default: llvm_unreachable("Invalid integer condition!"); + case ISD::SETEQ: return X86::COND_E; + case ISD::SETGT: return X86::COND_G; + case ISD::SETGE: return X86::COND_GE; + case ISD::SETLT: return X86::COND_L; + case ISD::SETLE: return X86::COND_LE; + case ISD::SETNE: return X86::COND_NE; + case ISD::SETULT: return X86::COND_B; + case ISD::SETUGT: return X86::COND_A; + case ISD::SETULE: return X86::COND_BE; + case ISD::SETUGE: return X86::COND_AE; + } +} + +/// Do a one-to-one translation of a ISD::CondCode to the X86-specific +/// condition code, returning the condition code and the LHS/RHS of the +/// comparison to make. +static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, + SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { + if (!isFP) { + if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { + if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { + // X > -1 -> X == 0, jump !sign. + RHS = DAG.getConstant(0, DL, RHS.getValueType()); + return X86::COND_NS; + } + if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { + // X < 0 -> X == 0, jump on sign. + return X86::COND_S; + } + if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { + // X < 1 -> X <= 0 + RHS = DAG.getConstant(0, DL, RHS.getValueType()); + return X86::COND_LE; + } + } + + return TranslateIntegerX86CC(SetCCOpcode); + } + + // First determine if it is required or is profitable to flip the operands. + + // If LHS is a foldable load, but RHS is not, flip the condition. 
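+  // e.g. 'setolt (load %p), %y' becomes 'setogt %y, (load %p)', which
+  // typically lets instruction selection fold the load into the memory
+  // operand of the comparison (UCOMISS/UCOMISD) instead of issuing a
+  // separate load.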
+ if (ISD::isNON_EXTLoad(LHS.getNode()) && + !ISD::isNON_EXTLoad(RHS.getNode())) { + SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); + std::swap(LHS, RHS); + } + + switch (SetCCOpcode) { + default: break; + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETUGT: + case ISD::SETUGE: + std::swap(LHS, RHS); + break; + } + + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (SetCCOpcode) { + default: llvm_unreachable("Condcode should be pre-legalized away"); + case ISD::SETUEQ: + case ISD::SETEQ: return X86::COND_E; + case ISD::SETOLT: // flipped + case ISD::SETOGT: + case ISD::SETGT: return X86::COND_A; + case ISD::SETOLE: // flipped + case ISD::SETOGE: + case ISD::SETGE: return X86::COND_AE; + case ISD::SETUGT: // flipped + case ISD::SETULT: + case ISD::SETLT: return X86::COND_B; + case ISD::SETUGE: // flipped + case ISD::SETULE: + case ISD::SETLE: return X86::COND_BE; + case ISD::SETONE: + case ISD::SETNE: return X86::COND_NE; + case ISD::SETUO: return X86::COND_P; + case ISD::SETO: return X86::COND_NP; + case ISD::SETOEQ: + case ISD::SETUNE: return X86::COND_INVALID; + } +} + +/// Is there a floating point cmov for the specific X86 condition code? +/// Current x86 isa includes the following FP cmov instructions: +/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. +static bool hasFPCMov(unsigned X86CC) { + switch (X86CC) { + default: + return false; + case X86::COND_B: + case X86::COND_BE: + case X86::COND_E: + case X86::COND_P: + case X86::COND_A: + case X86::COND_AE: + case X86::COND_NE: + case X86::COND_NP: + return true; + } +} + +/// Returns true if the target can instruction select the +/// specified FP immediate natively. If false, the legalizer will +/// materialize the FP immediate as a load from a constant pool. +bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { + if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) + return true; + } + return false; +} + +bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtTy, + EVT NewVT) const { + // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF + // relocation target a movq or addq instruction: don't let the load shrink. + SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); + if (BasePtr.getOpcode() == X86ISD::WrapperRIP) + if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) + return GA->getTargetFlags() != X86II::MO_GOTTPOFF; + return true; +} + +/// \brief Returns true if it is beneficial to convert a load of a constant +/// to just the constant itself. +bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0 || BitSize > 64) + return false; + return true; +} + +bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + return (Index == 0 || Index == ResVT.getVectorNumElements()); +} + +bool X86TargetLowering::isCheapToSpeculateCttz() const { + // Speculate cttz only if we can directly use TZCNT. + return Subtarget->hasBMI(); +} + +bool X86TargetLowering::isCheapToSpeculateCtlz() const { + // Speculate ctlz only if we can directly use LZCNT. 
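+  // Without LZCNT the expansion has to go through BSR, whose destination is
+  // undefined for a zero input, so the zero-check branch cannot be removed
+  // and speculation buys nothing.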
+  return Subtarget->hasLZCNT();
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+    if (0 <= Mask[i])
+      return false;
+  return true;
+}
+
+/// Return true if Val is undef or if its value falls within the
+/// specified range [Low, Hi).
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+  return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// Val is either less than zero (undef) or equal to the specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+  return (Val < 0 || Val == CmpVal);
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range [Low, Low+Size), or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
+                                       unsigned Pos, unsigned Size, int Low) {
+  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+    if (!isUndefOrEqual(Mask[i], Low))
+      return false;
+  return true;
+}
+
+/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
+/// extract that is suitable for instructions that extract 128 or 256-bit
+/// vectors.
+static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+    return false;
+
+  // The index should be aligned on a vecWidth-bit boundary.
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+  bool Result = (Index * ElSize) % vecWidth == 0;
+
+  return Result;
+}
+
+/// Return true if the specified INSERT_SUBVECTOR
+/// operand specifies a subvector insert that is suitable for input to
+/// insertion of 128 or 256-bit subvectors.
+static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+    return false;
+  // The index should be aligned on a vecWidth-bit boundary.
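+  // E.g. for a 128-bit insert into a v8i32 result (32-bit elements), index 4
+  // gives a bit offset of 4 * 32 == 128, a multiple of 128, and is accepted,
+  // while index 2 (bit offset 64) is rejected.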
+ uint64_t Index = + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + + MVT VT = N->getSimpleValueType(0); + unsigned ElSize = VT.getVectorElementType().getSizeInBits(); + bool Result = (Index * ElSize) % vecWidth == 0; + + return Result; +} + +bool X86::isVINSERT128Index(SDNode *N) { + return isVINSERTIndex(N, 128); +} + +bool X86::isVINSERT256Index(SDNode *N) { + return isVINSERTIndex(N, 256); +} + +bool X86::isVEXTRACT128Index(SDNode *N) { + return isVEXTRACTIndex(N, 128); +} + +bool X86::isVEXTRACT256Index(SDNode *N) { + return isVEXTRACTIndex(N, 256); +} + +static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); + assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) && + "Illegal extract subvector for VEXTRACT"); + + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + + MVT VecVT = N->getOperand(0).getSimpleValueType(); + MVT ElVT = VecVT.getVectorElementType(); + + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); + return Index / NumElemsPerChunk; +} + +static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); + assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) && + "Illegal insert subvector for VINSERT"); + + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + + MVT VecVT = N->getSimpleValueType(0); + MVT ElVT = VecVT.getVectorElementType(); + + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); + return Index / NumElemsPerChunk; +} + +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. +unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 128); +} + +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. +unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 256); +} + +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. +unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 128); +} + +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. +unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 256); +} + +/// Returns true if Elt is a constant zero or a floating point constant +0.0. +bool X86::isZeroNode(SDValue Elt) { + return isNullConstant(Elt) || isNullFPConstant(Elt); +} + +// Build a vector of constants +// Use an UNDEF node if MaskElt == -1. +// Spilt 64-bit constants in the 32-bit mode. 
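+// For example, when i64 is not a legal type (32-bit mode), the v2i64 constant
+// vector <1, 2> is materialized as the v4i32 vector <1, 0, 2, 0> and then
+// bitcast back to v2i64.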
+static SDValue getConstVector(ArrayRef<int> Values, MVT VT, + SelectionDAG &DAG, + SDLoc dl, bool IsMask = false) { + + SmallVector<SDValue, 32> Ops; + bool Split = false; + + MVT ConstVecVT = VT; + unsigned NumElts = VT.getVectorNumElements(); + bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); + if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { + ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); + Split = true; + } + + MVT EltVT = ConstVecVT.getVectorElementType(); + for (unsigned i = 0; i < NumElts; ++i) { + bool IsUndef = Values[i] < 0 && IsMask; + SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(Values[i], dl, EltVT); + Ops.push_back(OpNode); + if (Split) + Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(0, dl, EltVT)); + } + SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); + if (Split) + ConstsNode = DAG.getBitcast(VT, ConstsNode); + return ConstsNode; +} + +/// Returns a vector of specified type with all zero elements. +static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + + // Always build SSE zero vectors as <4 x i32> bitcasted + // to their dest type. This ensures they get CSE'd. + SDValue Vec; + if (VT.is128BitVector()) { // SSE + if (Subtarget->hasSSE2()) { // SSE2 + SDValue Cst = DAG.getConstant(0, dl, MVT::i32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + } else { // SSE1 + SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); + } + } else if (VT.is256BitVector()) { // AVX + if (Subtarget->hasInt256()) { // AVX2 + SDValue Cst = DAG.getConstant(0, dl, MVT::i32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); + } else { + // 256-bit logic and arithmetic instructions in AVX are all + // floating-point, no support for integer ops. Emit fp zeroed vectors. + SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); + } + } else if (VT.is512BitVector()) { // AVX-512 + SDValue Cst = DAG.getConstant(0, dl, MVT::i32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + } else if (VT.getVectorElementType() == MVT::i1) { + + assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) + && "Unexpected vector type"); + assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) + && "Unexpected vector type"); + SDValue Cst = DAG.getConstant(0, dl, MVT::i1); + SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } else + llvm_unreachable("Unexpected vector type"); + + return DAG.getBitcast(VT, Vec); +} + +static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl, + unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + unsigned Factor = VT.getSizeInBits()/vectorWidth; + EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, + VT.getVectorNumElements()/Factor); + + // Extract from UNDEF is UNDEF. 
+ if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getUNDEF(ResultVT); + + // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); + + // If the input is a buildvector just emit a smaller one. + if (Vec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, + makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); + + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); +} + +/// Generate a DAG to grab 128-bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 +/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 +/// instructions or a simple subregister reference. Idx is an index in the +/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert((Vec.getValueType().is256BitVector() || + Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); +} + +/// Generate a DAG to grab 256-bits from a 512-bit vector. +static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); +} + +static SDValue InsertSubVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl, unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + // Inserting UNDEF is Result + if (Vec.getOpcode() == ISD::UNDEF) + return Result; + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + EVT ResultVT = Result.getValueType(); + + // Insert the relevant vectorWidth bits. + unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); + + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); +} + +/// Generate a DAG to put 128-bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or +/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering INSERT_VECTOR_ELT operations easier. +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); + + // For insertion into the zero index (low half) of a 256-bit vector, it is + // more efficient to generate a blend with immediate instead of an insert*128. 
+ // We are still creating an INSERT_SUBVECTOR below with an undef node to + // extend the subvector to the size of the result vector. Make sure that + // we are not recursing on that node by checking for undef here. + if (IdxVal == 0 && Result.getValueType().is256BitVector() && + Result.getOpcode() != ISD::UNDEF) { + EVT ResultVT = Result.getValueType(); + SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(ResultVT); + SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, + Vec, ZeroIndex); + + // The blend instruction, and therefore its mask, depend on the data type. + MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); + if (ScalarType.isFloatingPoint()) { + // Choose either vblendps (float) or vblendpd (double). + unsigned ScalarSize = ScalarType.getSizeInBits(); + assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); + unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; + SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); + } + + const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + + // AVX2 is needed for 256-bit integer blend support. + // Integers must be cast to 32-bit because there is only vpblendd; + // vpblendw can't be used for this because it has a handicapped mask. + + // If we don't have AVX2, then cast to float. Using a wrong domain blend + // is still more efficient than using the wrong domain vinsertf128 that + // will be created by InsertSubVector(). + MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; + + SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); + Result = DAG.getBitcast(CastVT, Result); + Vec256 = DAG.getBitcast(CastVT, Vec256); + Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); + return DAG.getBitcast(ResultVT, Vec256); + } + + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); +} + +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); +} + +/// Insert i1-subvector to i1-vector. +static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (IdxVal == 0 && Vec.isUndef()) // the operation is legal + return Op; + + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + unsigned NumElems = OpVT.getVectorNumElements(); + unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); + + assert(IdxVal + SubVecNumElems <= NumElems && + IdxVal % SubVecVT.getSizeInBits() == 0 && + "Unexpected index value in INSERT_SUBVECTOR"); + + // There are 3 possible cases: + // 1. Subvector should be inserted in the lower part (IdxVal == 0) + // 2. Subvector should be inserted in the upper part + // (IdxVal + SubVecNumElems == NumElems) + // 3. 
Subvector should be inserted in the middle (for example v2i1 + // to v16i1, index 2) + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(OpVT); + SDValue WideSubVec = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); + if (Vec.isUndef()) + return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { + unsigned ShiftLeft = NumElems - SubVecNumElems; + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec; + } + + if (IdxVal == 0) { + // Zero lower bits of the Vec + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + // Merge them together + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + + // Simple case when we put subvector in the upper part + if (IdxVal + SubVecNumElems == NumElems) { + // Zero upper bits of the Vec + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + // Subvector should be inserted in the middle - use shuffle + SmallVector<int, 64> Mask; + for (unsigned i = 0; i < NumElems; ++i) + Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? + i : i + NumElems); + return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); +} + +/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 +/// instructions. This is used because creating CONCAT_VECTOR nodes of +/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower +/// large BUILD_VECTORS. +static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert128BitVector(V, V2, NumElems/2, DAG, dl); +} + +static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert256BitVector(V, V2, NumElems/2, DAG, dl); +} + +/// Returns a vector of specified type with all bits set. +/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with +/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. +/// Then bitcast to their original type, ensuring they get CSE'd. 
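+/// For example, an all-ones v4f64 on an AVX-only target is built as a
+/// <4 x i32> splat of ~0U concatenated with itself into a v8i32 and then
+/// bitcast to v4f64.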
+static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + + SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); + SDValue Vec; + if (VT.is512BitVector()) { + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + } else if (VT.is256BitVector()) { + if (Subtarget->hasInt256()) { // AVX2 + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); + } else { // AVX + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); + } + } else if (VT.is128BitVector()) { + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + } else + llvm_unreachable("Unexpected vector type"); + + return DAG.getBitcast(VT, Vec); +} + +/// Returns a vector_shuffle node for an unpackl operation. +static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + for (unsigned i = 0, e = NumElems/2; i != e; ++i) { + Mask.push_back(i); + Mask.push_back(i + NumElems); + } + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + +/// Returns a vector_shuffle node for an unpackh operation. +static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { + Mask.push_back(i + Half); + Mask.push_back(i + NumElems + Half); + } + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + +/// Return a vector_shuffle of the specified vector of zero or undef vector. +/// This produces a shuffle where the low element of V2 is swizzled into the +/// zero/undef vector, landing at element Idx. +/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). +static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, + bool IsZero, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = V2.getSimpleValueType(); + SDValue V1 = IsZero + ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 16> MaskVec; + for (unsigned i = 0; i != NumElems; ++i) + // If this is the insertion idx, put the low elt of V2 here. + MaskVec.push_back(i == Idx ? NumElems : i); + return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); +} + +/// Calculates the shuffle mask corresponding to the target-specific opcode. +/// Returns true if the Mask could be calculated. Sets IsUnary to true if only +/// uses one source. Note that this will set IsUnary for shuffles which use a +/// single input multiple times, and in those cases it will +/// adjust the mask to only have indices within that single input. +/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. 
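+/// For example, (X86ISD::UNPCKL v4i32 %a, %a) decodes to the mask <0,4,1,5>;
+/// because both inputs are the same node the shuffle is reported as unary and
+/// the mask is remapped to <0,0,1,1>.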
+static bool getTargetShuffleMask(SDNode *N, MVT VT, + SmallVectorImpl<int> &Mask, bool &IsUnary) { + unsigned NumElems = VT.getVectorNumElements(); + SDValue ImmN; + + IsUnary = false; + bool IsFakeUnary = false; + switch(N->getOpcode()) { + case X86ISD::BLENDI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::SHUFP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::UNPCKH: + DecodeUNPCKHMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::UNPCKL: + DecodeUNPCKLMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::MOVHLPS: + DecodeMOVHLPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::MOVLHPS: + DecodeMOVLHPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::PALIGNR: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::PSHUFD: + case X86ISD::VPERMILPI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFHW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFLW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFB: { + IsUnary = true; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + MVT VT = MaskNode.getSimpleValueType(); + assert(VT.isVector() && + "Can't produce a non-vector with a build_vector!"); + if (!VT.isInteger()) + return false; + + int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; + + SmallVector<uint64_t, 32> RawMask; + for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) { + RawMask.push_back((uint64_t)SM_SentinelUndef); + continue; + } + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + + // We now have to decode the element which could be any integer size and + // extract each byte of it. + for (int j = 0; j < NumBytesPerElement; ++j) { + // Note that this is x86 and so always little endian: the low byte is + // the first byte of the mask. 
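+          // E.g. a 32-bit mask element 0x03020100 contributes the bytes
+          // 0x00, 0x01, 0x02, 0x03, in that order.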
+ RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); + MaskElement = MaskElement.lshr(8); + } + } + DecodePSHUFBMask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodePSHUFBMask(C, Mask); + break; + } + + return false; + } + case X86ISD::VPERMI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::MOVSS: + case X86ISD::MOVSD: + DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); + break; + case X86ISD::VPERM2X128: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + // Mask only contains negative index if an element is zero. + if (std::any_of(Mask.begin(), Mask.end(), + [](int M){ return M == SM_SentinelZero; })) + return false; + break; + case X86ISD::MOVSLDUP: + DecodeMOVSLDUPMask(VT, Mask); + IsUnary = true; + break; + case X86ISD::MOVSHDUP: + DecodeMOVSHDUPMask(VT, Mask); + IsUnary = true; + break; + case X86ISD::MOVDDUP: + DecodeMOVDDUPMask(VT, Mask); + IsUnary = true; + break; + case X86ISD::MOVLHPD: + case X86ISD::MOVLPD: + case X86ISD::MOVLPS: + // Not yet implemented + return false; + case X86ISD::VPERMV: { + IsUnary = true; + SDValue MaskNode = N->getOperand(0); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); + SmallVector<uint64_t, 32> RawMask; + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. 
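+      // E.g. a constant v8i32 mask <7,6,5,4,3,2,1,0> reverses the source
+      // vector; only the low Log2(NumElts) bits of each element are kept.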
+ assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else if (isa<ConstantSDNode>(Op)) { + APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } else + return false; + } + DecodeVPERMVMask(RawMask, Mask); + break; + } + if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { + unsigned NumEltsInMask = MaskNode->getNumOperands(); + MaskNode = MaskNode->getOperand(0); + if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) { + APInt MaskEltValue = CN->getAPIntValue(); + for (unsigned i = 0; i < NumEltsInMask; ++i) + RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); + DecodeVPERMVMask(RawMask, Mask); + break; + } + // It may be a scalar load + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMVMask(C, VT, Mask); + break; + } + return false; + } + case X86ISD::VPERMV3: { + IsUnary = false; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(1); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + SmallVector<uint64_t, 32> RawMask; + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else { + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } + } + DecodeVPERMV3Mask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMV3Mask(C, VT, Mask); + break; + } + return false; + } + default: llvm_unreachable("unknown target shuffle node"); + } + + // Empty mask indicates the decode failed. + if (Mask.empty()) + return false; + + // If we have a fake unary shuffle, the shuffle mask is spread across two + // inputs that are actually the same node. Re-map the mask to always point + // into the first input. + if (IsFakeUnary) + for (int &M : Mask) + if (M >= (int)Mask.size()) + M -= Mask.size(); + + return true; +} + +/// Returns the scalar element that will make up the ith +/// element of the result of the vector shuffle. 
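+/// For example, querying index 2 of (vector_shuffle<4,1,6,3> %a, %b) recurses
+/// into element 2 of the second operand %b, since mask entry 6 selects the
+/// third element of %b.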
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, + unsigned Depth) { + if (Depth == 6) + return SDValue(); // Limit search depth. + + SDValue V = SDValue(N, 0); + EVT VT = V.getValueType(); + unsigned Opcode = V.getOpcode(); + + // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. + if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { + int Elt = SV->getMaskElt(Index); + + if (Elt < 0) + return DAG.getUNDEF(VT.getVectorElementType()); + + unsigned NumElems = VT.getVectorNumElements(); + SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) + : SV->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); + } + + // Recurse into target specific vector shuffles to find scalars. + if (isTargetShuffle(Opcode)) { + MVT ShufVT = V.getSimpleValueType(); + unsigned NumElems = ShufVT.getVectorNumElements(); + SmallVector<int, 16> ShuffleMask; + bool IsUnary; + + if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) + return SDValue(); + + int Elt = ShuffleMask[Index]; + if (Elt < 0) + return DAG.getUNDEF(ShufVT.getVectorElementType()); + + SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) + : N->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, + Depth+1); + } + + // Actual nodes that may contain scalar elements + if (Opcode == ISD::BITCAST) { + V = V.getOperand(0); + EVT SrcVT = V.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) + return SDValue(); + } + + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) + return (Index == 0) ? V.getOperand(0) + : DAG.getUNDEF(VT.getVectorElementType()); + + if (V.getOpcode() == ISD::BUILD_VECTOR) + return V.getOperand(Index); + + return SDValue(); +} + +/// Custom lower build_vector of v16i8. +static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget* Subtarget, + const TargetLowering &TLI) { + if (NumNonZero > 8) + return SDValue(); + + SDLoc dl(Op); + SDValue V; + bool First = true; + + // SSE4.1 - use PINSRB to insert each byte directly. + if (Subtarget->hasSSE41()) { + for (unsigned i = 0; i < 16; ++i) { + bool isNonZero = (NonZeros & (1 << i)) != 0; + if (isNonZero) { + if (First) { + if (NumZero) + V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v16i8); + First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + MVT::v16i8, V, Op.getOperand(i), + DAG.getIntPtrConstant(i, dl)); + } + } + + return V; + } + + // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. 
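+  // Each adjacent byte pair is zero-extended to i16, the odd byte is shifted
+  // left by 8 and OR'ed with the even byte (zero bytes are simply skipped),
+  // and the combined i16 is inserted into a v8i16 that is bitcast back to
+  // v16i8 at the end.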
+ for (unsigned i = 0; i < 16; ++i) { + bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; + if (ThisIsNonZero && First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v8i16); + First = false; + } + + if ((i & 1) != 0) { + SDValue ThisElt, LastElt; + bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; + if (LastIsNonZero) { + LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, + MVT::i16, Op.getOperand(i-1)); + } + if (ThisIsNonZero) { + ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); + ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, + ThisElt, DAG.getConstant(8, dl, MVT::i8)); + if (LastIsNonZero) + ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); + } else + ThisElt = LastElt; + + if (ThisElt.getNode()) + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, + DAG.getIntPtrConstant(i/2, dl)); + } + } + + return DAG.getBitcast(MVT::v16i8, V); +} + +/// Custom lower build_vector of v8i16. +static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget* Subtarget, + const TargetLowering &TLI) { + if (NumNonZero > 4) + return SDValue(); + + SDLoc dl(Op); + SDValue V; + bool First = true; + for (unsigned i = 0; i < 8; ++i) { + bool isNonZero = (NonZeros & (1 << i)) != 0; + if (isNonZero) { + if (First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v8i16); + First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + MVT::v8i16, V, Op.getOperand(i), + DAG.getIntPtrConstant(i, dl)); + } + } + + return V; +} + +/// Custom lower build_vector of v4i32 or v4f32. +static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget, + const TargetLowering &TLI) { + // Find all zeroable elements. + std::bitset<4> Zeroable; + for (int i=0; i < 4; ++i) { + SDValue Elt = Op->getOperand(i); + Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); + } + assert(Zeroable.size() - Zeroable.count() > 1 && + "We expect at least two non-zero elements!"); + + // We only know how to deal with build_vector nodes where elements are either + // zeroable or extract_vector_elt with constant index. + SDValue FirstNonZero; + unsigned FirstNonZeroIdx; + for (unsigned i=0; i < 4; ++i) { + if (Zeroable[i]) + continue; + SDValue Elt = Op->getOperand(i); + if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Elt.getOperand(1))) + return SDValue(); + // Make sure that this node is extracting from a 128-bit vector. + MVT VT = Elt.getOperand(0).getSimpleValueType(); + if (!VT.is128BitVector()) + return SDValue(); + if (!FirstNonZero.getNode()) { + FirstNonZero = Elt; + FirstNonZeroIdx = i; + } + } + + assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); + SDValue V1 = FirstNonZero.getOperand(0); + MVT VT = V1.getSimpleValueType(); + + // See if this build_vector can be lowered as a blend with zero. + SDValue Elt; + unsigned EltMaskIdx, EltIdx; + int Mask[4]; + for (EltIdx = 0; EltIdx < 4; ++EltIdx) { + if (Zeroable[EltIdx]) { + // The zero vector will be on the right hand side. + Mask[EltIdx] = EltIdx+4; + continue; + } + + Elt = Op->getOperand(EltIdx); + // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. 
+ EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue(); + if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) + break; + Mask[EltIdx] = EltIdx; + } + + if (EltIdx == 4) { + // Let the shuffle legalizer deal with blend operations. + SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); + if (V1.getSimpleValueType() != VT) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); + } + + // See if we can lower this build_vector to a INSERTPS. + if (!Subtarget->hasSSE41()) + return SDValue(); + + SDValue V2 = Elt.getOperand(0); + if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) + V1 = SDValue(); + + bool CanFold = true; + for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { + if (Zeroable[i]) + continue; + + SDValue Current = Op->getOperand(i); + SDValue SrcVector = Current->getOperand(0); + if (!V1.getNode()) + V1 = SrcVector; + CanFold = SrcVector == V1 && + cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i; + } + + if (!CanFold) + return SDValue(); + + assert(V1.getNode() && "Expected at least two non-zero elements!"); + if (V1.getSimpleValueType() != MVT::v4f32) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); + if (V2.getSimpleValueType() != MVT::v4f32) + V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); + + // Ok, we can emit an INSERTPS instruction. + unsigned ZMask = Zeroable.to_ulong(); + + unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + SDLoc DL(Op); + SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getIntPtrConstant(InsertPSMask, DL)); + return DAG.getBitcast(VT, Result); +} + +/// Return a vector logical shift node. +static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, + unsigned NumBits, SelectionDAG &DAG, + const TargetLowering &TLI, SDLoc dl) { + assert(VT.is128BitVector() && "Unknown type for VShift"); + MVT ShVT = MVT::v2i64; + unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; + SrcOp = DAG.getBitcast(ShVT, SrcOp); + MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); + assert(NumBits % 8 == 0 && "Only support byte sized shifts"); + SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); + return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); +} + +static SDValue +LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { + + // Check if the scalar load can be widened into a vector load. And if + // the address is "base + cst" see if the cst can be "absorbed" into + // the shuffle mask. + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { + SDValue Ptr = LD->getBasePtr(); + if (!ISD::isNormalLoad(LD) || LD->isVolatile()) + return SDValue(); + EVT PVT = LD->getValueType(0); + if (PVT != MVT::i32 && PVT != MVT::f32) + return SDValue(); + + int FI = -1; + int64_t Offset = 0; + if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { + FI = FINode->getIndex(); + Offset = 0; + } else if (DAG.isBaseWithConstantOffset(Ptr) && + isa<FrameIndexSDNode>(Ptr.getOperand(0))) { + FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); + Offset = Ptr.getConstantOperandVal(1); + Ptr = Ptr.getOperand(0); + } else { + return SDValue(); + } + + // FIXME: 256-bit vector instructions don't require a strict alignment, + // improve this code to support it better. 
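+    // Illustrative example: an f32 load at frame-object offset 8 that is
+    // splatted to v4f32 becomes a 16-byte load of the (re-aligned) object at
+    // offset 0 followed by a splat shuffle of element 2.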
+ unsigned RequiredAlign = VT.getSizeInBits()/8; + SDValue Chain = LD->getChain(); + // Make sure the stack object alignment is at least 16 or 32. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { + if (MFI->isFixedObjectIndex(FI)) { + // Can't change the alignment. FIXME: It's possible to compute + // the exact stack offset and reference FI + adjust offset instead. + // If someone *really* cares about this. That's the way to implement it. + return SDValue(); + } else { + MFI->setObjectAlignment(FI, RequiredAlign); + } + } + + // (Offset % 16 or 32) must be multiple of 4. Then address is then + // Ptr + (Offset & ~15). + if (Offset < 0) + return SDValue(); + if ((Offset % RequiredAlign) & 3) + return SDValue(); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); + if (StartOffset) { + SDLoc DL(Ptr); + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(StartOffset, DL, Ptr.getValueType())); + } + + int EltNo = (Offset - StartOffset) >> 2; + unsigned NumElems = VT.getVectorNumElements(); + + EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); + SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, + LD->getPointerInfo().getWithOffset(StartOffset), + false, false, false, 0); + + SmallVector<int, 8> Mask(NumElems, EltNo); + + return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); + } + + return SDValue(); +} + +/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the +/// elements can be replaced by a single large load which has the same value as +/// a build_vector or insert_subvector whose loaded operands are 'Elts'. +/// +/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a +/// +/// FIXME: we'd also like to handle the case where the last elements are zero +/// rather than undef via VZEXT_LOAD, but we do not detect that case today. +/// There's even a handy isZeroNode for that purpose. +static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, + SDLoc &DL, SelectionDAG &DAG, + bool isAfterLegalize) { + unsigned NumElems = Elts.size(); + + LoadSDNode *LDBase = nullptr; + unsigned LastLoadedElt = -1U; + + // For each element in the initializer, see if we've found a load or an undef. + // If we don't find an initial load element, or later load elements are + // non-consecutive, bail out. + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Elts[i]; + // Look through a bitcast. + if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST) + Elt = Elt.getOperand(0); + if (!Elt.getNode() || + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + return SDValue(); + if (!LDBase) { + if (Elt.getNode()->getOpcode() == ISD::UNDEF) + return SDValue(); + LDBase = cast<LoadSDNode>(Elt.getNode()); + LastLoadedElt = i; + continue; + } + if (Elt.getOpcode() == ISD::UNDEF) + continue; + + LoadSDNode *LD = cast<LoadSDNode>(Elt); + EVT LdVT = Elt.getValueType(); + // Each loaded element must be the correct fractional portion of the + // requested vector load. + if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems) + return SDValue(); + if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i)) + return SDValue(); + LastLoadedElt = i; + } + + // If we have found an entire vector of loads and undefs, then return a large + // load of the entire vector width starting at the base pointer. If we found + // consecutive loads for the low half, generate a vzext_load node. 
+ if (LastLoadedElt == NumElems - 1) { + assert(LDBase && "Did not find base load for merging consecutive loads"); + EVT EltVT = LDBase->getValueType(0); + // Ensure that the input vector size for the merged loads matches the + // cumulative size of the input elements. + if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) + return SDValue(); + + if (isAfterLegalize && + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + + SDValue NewLd = SDValue(); + + NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), LDBase->isVolatile(), + LDBase->isNonTemporal(), LDBase->isInvariant(), + LDBase->getAlignment()); + + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + } + + return NewLd; + } + + //TODO: The code below fires only for for loading the low v2i32 / v2f32 + //of a v4i32 / v4f32. It's probably worth generalizing. + EVT EltVT = VT.getVectorElementType(); + if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && + DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, + LDBase->getPointerInfo(), + LDBase->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as LDBase in + // terms of dependency. We create a TokenFactor for LDBase and ResNode, and + // update uses of LDBase's output chain to use the TokenFactor. + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + } + + return DAG.getBitcast(VT, ResNode); + } + return SDValue(); +} + +/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction +/// to generate a splat value for the following cases: +/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. +/// 2. A splat shuffle which uses a scalar_to_vector node which comes from +/// a scalar load, or a constant. +/// The VBROADCAST node is returned when a pattern is found, +/// or SDValue() otherwise. +static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, + SelectionDAG &DAG) { + // VBROADCAST requires AVX. + // TODO: Splats could be generated for non-AVX CPUs using SSE + // instructions, but there's less potential gain for only 128-bit vectors. + if (!Subtarget->hasAVX()) + return SDValue(); + + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && + "Unsupported vector type for broadcast."); + + SDValue Ld; + bool ConstSplatVal; + + switch (Op.getOpcode()) { + default: + // Unknown pattern found. 
+ return SDValue(); + + case ISD::BUILD_VECTOR: { + auto *BVOp = cast<BuildVectorSDNode>(Op.getNode()); + BitVector UndefElements; + SDValue Splat = BVOp->getSplatValue(&UndefElements); + + // We need a splat of a single value to use broadcast, and it doesn't + // make any sense if the value is only in one element of the vector. + if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) + return SDValue(); + + Ld = Splat; + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); + + // Make sure that all of the users of a non-constant load are from the + // BUILD_VECTOR node. + if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) + return SDValue(); + break; + } + + case ISD::VECTOR_SHUFFLE: { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + + // Shuffles must have a splat mask where the first element is + // broadcasted. + if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) + return SDValue(); + + SDValue Sc = Op.getOperand(0); + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && + Sc.getOpcode() != ISD::BUILD_VECTOR) { + + if (!Subtarget->hasInt256()) + return SDValue(); + + // Use the register form of the broadcast instruction available on AVX2. + if (VT.getSizeInBits() >= 256) + Sc = Extract128BitVector(Sc, 0, DAG, dl); + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); + } + + Ld = Sc.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); + + // The scalar_to_vector node and the suspected + // load node must have exactly one user. + // Constants may have multiple users. + + // AVX-512 has register version of the broadcast + bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && + Ld.getValueType().getSizeInBits() >= 32; + if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && + !hasRegVer)) + return SDValue(); + break; + } + } + + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); + bool IsGE256 = (VT.getSizeInBits() >= 256); + + // When optimizing for size, generate up to 5 extra bytes for a broadcast + // instruction to save 8 or more bytes of constant pool data. + // TODO: If multiple splats are generated to load the same constant, + // it may be detrimental to overall size. There needs to be a way to detect + // that condition to know if this is truly a size win. + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); + + // Handle broadcasting a single constant scalar from the constant pool + // into a vector. + // On Sandybridge (no AVX2), it is still better to load a constant vector + // from the constant pool and not to broadcast it from a scalar. + // But override that restriction when optimizing for size. + // TODO: Check if splatting is recommended for other AVX-capable CPUs. + if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { + EVT CVT = Ld.getValueType(); + assert(!CVT.isVector() && "Must not broadcast a vector type"); + + // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. + // For size optimization, also splat v2f64 and v2i64, and for size opt + // with AVX2, also splat i8 and i16. + // With pattern matching, the VBROADCAST node may become a VMOVDDUP. 
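+    // E.g. a v8f32 build_vector splatting the constant 42.0f becomes a
+    // VBROADCAST of a single f32 constant-pool load here, instead of a load
+    // of a full 32-byte constant-pool vector.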
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { + const Constant *C = nullptr; + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) + C = CI->getConstantIntValue(); + else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) + C = CF->getConstantFPValue(); + + assert(C && "Invalid constant type"); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue CP = + DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + Ld = DAG.getLoad( + CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); + + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + } + + bool IsLoad = ISD::isNormalLoad(Ld.getNode()); + + // Handle AVX2 in-register broadcasts. + if (!IsLoad && Subtarget->hasInt256() && + (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // The scalar source must be a normal load. + if (!IsLoad) + return SDValue(); + + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (Subtarget->hasVLX() && ScalarSize == 64)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // The integer check is needed for the 64-bit into 128-bit so it doesn't match + // double since there is no vbroadcastsd xmm + if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { + if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + + // Unsupported broadcast. + return SDValue(); +} + +/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real +/// underlying vector and index. +/// +/// Modifies \p ExtractedFromVec to the real vector and returns the real +/// index. +static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, + SDValue ExtIdx) { + int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); + if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) + return Idx; + + // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already + // lowered this: + // (extract_vector_elt (v8f32 %vreg1), Constant<6>) + // to: + // (extract_vector_elt (vector_shuffle<2,u,u,u> + // (extract_subvector (v8f32 %vreg0), Constant<4>), + // undef) + // Constant<0>) + // In this case the vector is the extract_subvector expression and the index + // is 2, as specified by the shuffle. + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec); + SDValue ShuffleVec = SVOp->getOperand(0); + MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); + assert(ShuffleVecVT.getVectorElementType() == + ExtractedFromVec.getSimpleValueType().getVectorElementType()); + + int ShuffleIdx = SVOp->getMaskElt(Idx); + if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { + ExtractedFromVec = ShuffleVec; + return ShuffleIdx; + } + return Idx; +} + +static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + + // Skip if insert_vec_elt is not supported. 
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) + return SDValue(); + + SDLoc DL(Op); + unsigned NumElems = Op.getNumOperands(); + + SDValue VecIn1; + SDValue VecIn2; + SmallVector<unsigned, 4> InsertIndices; + SmallVector<int, 8> Mask(NumElems, -1); + + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Opc = Op.getOperand(i).getOpcode(); + + if (Opc == ISD::UNDEF) + continue; + + if (Opc != ISD::EXTRACT_VECTOR_ELT) { + // Quit if more than 1 elements need inserting. + if (InsertIndices.size() > 1) + return SDValue(); + + InsertIndices.push_back(i); + continue; + } + + SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); + SDValue ExtIdx = Op.getOperand(i).getOperand(1); + // Quit if non-constant index. + if (!isa<ConstantSDNode>(ExtIdx)) + return SDValue(); + int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); + + // Quit if extracted from vector of different type. + if (ExtractedFromVec.getValueType() != VT) + return SDValue(); + + if (!VecIn1.getNode()) + VecIn1 = ExtractedFromVec; + else if (VecIn1 != ExtractedFromVec) { + if (!VecIn2.getNode()) + VecIn2 = ExtractedFromVec; + else if (VecIn2 != ExtractedFromVec) + // Quit if more than 2 vectors to shuffle + return SDValue(); + } + + if (ExtractedFromVec == VecIn1) + Mask[i] = Idx; + else if (ExtractedFromVec == VecIn2) + Mask[i] = Idx + NumElems; + } + + if (!VecIn1.getNode()) + return SDValue(); + + VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); + SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); + for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { + unsigned Idx = InsertIndices[i]; + NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), + DAG.getIntPtrConstant(Idx, DL)); + } + + return NV; +} + +static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { + assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && + Op.getScalarValueSizeInBits() == 1 && + "Can not convert non-constant vector"); + uint64_t Immediate = 0; + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { + SDValue In = Op.getOperand(idx); + if (In.getOpcode() != ISD::UNDEF) + Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + } + SDLoc dl(Op); + MVT VT = + MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8)); + return DAG.getConstant(Immediate, dl, VT); +} +// Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 
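+// For example, the constant v8i1 vector <1,0,1,1,0,0,0,0> is packed into the
+// i8 immediate 0b00001101 (13) and bitcast to v8i1; any non-constant elements
+// are inserted individually afterwards.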
+SDValue +X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { + + MVT VT = Op.getSimpleValueType(); + assert((VT.getVectorElementType() == MVT::i1) && + "Unexpected type in LowerBUILD_VECTORvXi1!"); + + SDLoc dl(Op); + if (ISD::isBuildVectorAllZeros(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } + + if (ISD::isBuildVectorAllOnes(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } + + if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { + SDValue Imm = ConvertI1VectorToInteger(Op, DAG); + if (Imm.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, Imm); + SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); + } + + // Vector has one or more non-const elements + uint64_t Immediate = 0; + SmallVector<unsigned, 16> NonConstIdx; + bool IsSplat = true; + bool HasConstElts = false; + int SplatIdx = -1; + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { + SDValue In = Op.getOperand(idx); + if (In.getOpcode() == ISD::UNDEF) + continue; + if (!isa<ConstantSDNode>(In)) + NonConstIdx.push_back(idx); + else { + Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + HasConstElts = true; + } + if (SplatIdx == -1) + SplatIdx = idx; + else if (In != Op.getOperand(SplatIdx)) + IsSplat = false; + } + + // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" + if (IsSplat) + return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), + DAG.getConstant(1, dl, VT), + DAG.getConstant(0, dl, VT)); + + // insert elements one by one + SDValue DstVec; + SDValue Imm; + if (Immediate) { + MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); + Imm = DAG.getConstant(Immediate, dl, ImmVT); + } + else if (HasConstElts) + Imm = DAG.getConstant(0, dl, VT); + else + Imm = DAG.getUNDEF(VT); + if (Imm.getValueSizeInBits() == VT.getSizeInBits()) + DstVec = DAG.getBitcast(VT, Imm); + else { + SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); + DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); + } + + for (unsigned i = 0; i < NonConstIdx.size(); ++i) { + unsigned InsertIdx = NonConstIdx[i]; + DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + Op.getOperand(InsertIdx), + DAG.getIntPtrConstant(InsertIdx, dl)); + } + return DstVec; +} + +/// \brief Return true if \p N implements a horizontal binop and return the +/// operands for the horizontal binop into V0 and V1. +/// +/// This is a helper function of LowerToHorizontalOp(). +/// This function checks that the build_vector \p N in input implements a +/// horizontal operation. Parameter \p Opcode defines the kind of horizontal +/// operation to match. +/// For example, if \p Opcode is equal to ISD::ADD, then this function +/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode +/// is equal to ISD::SUB, then this function checks if this is a horizontal +/// arithmetic sub. +/// +/// This function only analyzes elements of \p N whose indices are +/// in range [BaseIdx, LastIdx). 
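+/// For example, a v4f32 build_vector of
+///   (fadd (extractelt %a, 0), (extractelt %a, 1)),
+///   (fadd (extractelt %a, 2), (extractelt %a, 3)),
+///   (fadd (extractelt %b, 0), (extractelt %b, 1)),
+///   (fadd (extractelt %b, 2), (extractelt %b, 3))
+/// matches a horizontal FADD with V0 = %a and V1 = %b.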
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, + SelectionDAG &DAG, + unsigned BaseIdx, unsigned LastIdx, + SDValue &V0, SDValue &V1) { + EVT VT = N->getValueType(0); + + assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); + assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && + "Invalid Vector in input!"); + + bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); + bool CanFold = true; + unsigned ExpectedVExtractIdx = BaseIdx; + unsigned NumElts = LastIdx - BaseIdx; + V0 = DAG.getUNDEF(VT); + V1 = DAG.getUNDEF(VT); + + // Check if N implements a horizontal binop. + for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { + SDValue Op = N->getOperand(i + BaseIdx); + + // Skip UNDEFs. + if (Op->getOpcode() == ISD::UNDEF) { + // Update the expected vector extract index. + if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; + ExpectedVExtractIdx += 2; + continue; + } + + CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); + + if (!CanFold) + break; + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) + CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Op0.getOperand(0) == Op1.getOperand(0) && + isa<ConstantSDNode>(Op0.getOperand(1)) && + isa<ConstantSDNode>(Op1.getOperand(1))); + if (!CanFold) + break; + + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); + + if (i * 2 < NumElts) { + if (V0.getOpcode() == ISD::UNDEF) { + V0 = Op0.getOperand(0); + if (V0.getValueType() != VT) + return false; + } + } else { + if (V1.getOpcode() == ISD::UNDEF) { + V1 = Op0.getOperand(0); + if (V1.getValueType() != VT) + return false; + } + if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; + } + + SDValue Expected = (i * 2 < NumElts) ? V0 : V1; + if (I0 == ExpectedVExtractIdx) + CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; + else if (IsCommutable && I1 == ExpectedVExtractIdx) { + // Try to match the following dag sequence: + // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) + CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; + } else + CanFold = false; + + ExpectedVExtractIdx += 2; + } + + return CanFold; +} + +/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by +/// a concat_vector. +/// +/// This is a helper function of LowerToHorizontalOp(). +/// This function expects two 256-bit vectors called V0 and V1. +/// At first, each vector is split into two separate 128-bit vectors. +/// Then, the resulting 128-bit vectors are used to implement two +/// horizontal binary operations. +/// +/// The kind of horizontal binary operation is defined by \p X86Opcode. +/// +/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to +/// the two new horizontal binop. +/// When Mode is set, the first horizontal binop dag node would take as input +/// the lower 128-bit of V0 and the upper 128-bit of V0. The second +/// horizontal binop dag node would take as input the lower 128-bit of V1 +/// and the upper 128-bit of V1. 
+/// Example: +/// HADD V0_LO, V0_HI +/// HADD V1_LO, V1_HI +/// +/// Otherwise, the first horizontal binop dag node takes as input the lower +/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop +/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. +/// Example: +/// HADD V0_LO, V1_LO +/// HADD V0_HI, V1_HI +/// +/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower +/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to +/// the upper 128-bits of the result. +static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, + SDLoc DL, SelectionDAG &DAG, + unsigned X86Opcode, bool Mode, + bool isUndefLO, bool isUndefHI) { + EVT VT = V0.getValueType(); + assert(VT.is256BitVector() && VT == V1.getValueType() && + "Invalid nodes in input!"); + + unsigned NumElts = VT.getVectorNumElements(); + SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); + SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); + SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); + SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); + EVT NewVT = V0_LO.getValueType(); + + SDValue LO = DAG.getUNDEF(NewVT); + SDValue HI = DAG.getUNDEF(NewVT); + + if (Mode) { + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); + if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) + HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); + } else { + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || + V1_LO->getOpcode() != ISD::UNDEF)) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); + + if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || + V1_HI->getOpcode() != ISD::UNDEF)) + HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); + } + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); +} + +/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB +/// node. +static SDValue LowerToAddSub(const BuildVectorSDNode *BV, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + MVT VT = BV->getSimpleValueType(0); + if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + return SDValue(); + + SDLoc DL(BV); + unsigned NumElts = VT.getVectorNumElements(); + SDValue InVec0 = DAG.getUNDEF(VT); + SDValue InVec1 = DAG.getUNDEF(VT); + + assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || + VT == MVT::v2f64) && "build_vector with an invalid type found!"); + + // Odd-numbered elements in the input build vector are obtained from + // adding two integer/float elements. + // Even-numbered elements in the input build vector are obtained from + // subtracting two integer/float elements. + unsigned ExpectedOpcode = ISD::FSUB; + unsigned NextExpectedOpcode = ISD::FADD; + bool AddFound = false; + bool SubFound = false; + + for (unsigned i = 0, e = NumElts; i != e; ++i) { + SDValue Op = BV->getOperand(i); + + // Skip 'undef' values. + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::UNDEF) { + std::swap(ExpectedOpcode, NextExpectedOpcode); + continue; + } + + // Early exit if we found an unexpected opcode. 
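// [Editor's note] The expected opcode alternates between FSUB at even lanes
// and FADD at odd lanes, so for example a v4f32 build_vector of
//   (fsub a0, b0), (fadd a1, b1), (fsub a2, b2), (fadd a3, b3)
// with each operand extracted from lane i of A and B is the ADDSUBPS pattern
// this loop is looking for (illustration only, not part of the imported file).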
+ if (Opcode != ExpectedOpcode) + return SDValue(); + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) + // Early exit if we cannot match that sequence. + if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Op0.getOperand(1)) || + !isa<ConstantSDNode>(Op1.getOperand(1)) || + Op0.getOperand(1) != Op1.getOperand(1)) + return SDValue(); + + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + if (I0 != i) + return SDValue(); + + // We found a valid add/sub node. Update the information accordingly. + if (i & 1) + AddFound = true; + else + SubFound = true; + + // Update InVec0 and InVec1. + if (InVec0.getOpcode() == ISD::UNDEF) { + InVec0 = Op0.getOperand(0); + if (InVec0.getSimpleValueType() != VT) + return SDValue(); + } + if (InVec1.getOpcode() == ISD::UNDEF) { + InVec1 = Op1.getOperand(0); + if (InVec1.getSimpleValueType() != VT) + return SDValue(); + } + + // Make sure that operands in input to each add/sub node always + // come from a same pair of vectors. + if (InVec0 != Op0.getOperand(0)) { + if (ExpectedOpcode == ISD::FSUB) + return SDValue(); + + // FADD is commutable. Try to commute the operands + // and then test again. + std::swap(Op0, Op1); + if (InVec0 != Op0.getOperand(0)) + return SDValue(); + } + + if (InVec1 != Op1.getOperand(0)) + return SDValue(); + + // Update the pair of expected opcodes. + std::swap(ExpectedOpcode, NextExpectedOpcode); + } + + // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. + if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && + InVec1.getOpcode() != ISD::UNDEF) + return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); + + return SDValue(); +} + +/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. +static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = BV->getSimpleValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumUndefsLO = 0; + unsigned NumUndefsHI = 0; + unsigned Half = NumElts/2; + + // Count the number of UNDEF operands in the build_vector in input. + for (unsigned i = 0, e = Half; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsLO++; + + for (unsigned i = Half, e = NumElts; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsHI++; + + // Early exit if this is either a build_vector of all UNDEFs or all the + // operands but one are UNDEF. + if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) + return SDValue(); + + SDLoc DL(BV); + SDValue InVec0, InVec1; + if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { + // Try to match an SSE3 float HADD/HSUB. + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { + // Try to match an SSSE3 integer HADD/HSUB. 
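// [Editor's note] That is, PHADDW/PHSUBW for v8i16 and PHADDD/PHSUBD for
// v4i32; the integer matching below mirrors the float case above.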
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); + } + + if (!Subtarget->hasAVX()) + return SDValue(); + + if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { + // Try to match an AVX horizontal add/sub of packed single/double + // precision floating point values from 256-bit vectors. + SDValue InVec2, InVec3; + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { + // Try to match an AVX2 horizontal add/sub of signed integers. + SDValue InVec2, InVec3; + unsigned X86Opcode; + bool CanFold = true; + + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + X86Opcode = X86ISD::HSUB; + else + CanFold = false; + + if (CanFold) { + // Fold this build_vector into a single horizontal add/sub. + // Do this only if the target has AVX2. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); + + // Do not try to expand this build_vector into a pair of horizontal + // add/sub if we can emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + + // Convert this build_vector into a pair of horizontal binop followed by + // a concat vector. 
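// [Editor's note] For example, a v8i32 horizontal add on a pre-AVX2 target
// becomes two 128-bit PHADDDs, one on the low halves of the inputs and one on
// the high halves, whose results ExpandHorizontalBinOp concatenates back into
// a single 256-bit vector (illustration only, not part of the imported file).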
+ bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, + isUndefLO, isUndefHI); + } + } + + if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || + VT == MVT::v16i16) && Subtarget->hasAVX()) { + unsigned X86Opcode; + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HSUB; + else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHADD; + else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHSUB; + else + return SDValue(); + + // Don't try to expand this build_vector into a pair of horizontal add/sub + // if we can simply emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + + // Convert this build_vector into two horizontal add/sub followed by + // a concat vector. + bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, + isUndefLO, isUndefHI); + } + + return SDValue(); +} + +SDValue +X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + + MVT VT = Op.getSimpleValueType(); + MVT ExtVT = VT.getVectorElementType(); + unsigned NumElems = Op.getNumOperands(); + + // Generate vectors for predicate vectors. + if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512()) + return LowerBUILD_VECTORvXi1(Op, DAG); + + // Vectors containing all zeros can be matched by pxor and xorps later + if (ISD::isBuildVectorAllZeros(Op.getNode())) { + // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd + // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. + if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) + return Op; + + return getZeroVector(VT, Subtarget, DAG, dl); + } + + // Vectors containing all ones can be matched by pcmpeqd on 128-bit width + // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use + // vpcmpeqd on 256-bit vectors. + if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { + if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) + return Op; + + if (!VT.is512BitVector()) + return getOnesVector(VT, Subtarget, DAG, dl); + } + + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); + if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG)) + return AddSub; + if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) + return HorizontalOp; + if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) + return Broadcast; + + unsigned EVTBits = ExtVT.getSizeInBits(); + + unsigned NumZero = 0; + unsigned NumNonZero = 0; + uint64_t NonZeros = 0; + bool IsAllConstants = true; + SmallSet<SDValue, 8> Values; + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Op.getOperand(i); + if (Elt.getOpcode() == ISD::UNDEF) + continue; + Values.insert(Elt); + if (Elt.getOpcode() != ISD::Constant && + Elt.getOpcode() != ISD::ConstantFP) + IsAllConstants = false; + if (X86::isZeroNode(Elt)) + NumZero++; + else { + assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. + NonZeros |= ((uint64_t)1 << i); + NumNonZero++; + } + } + + // All undef vector. Return an UNDEF. All zero vectors were handled above. 
+ if (NumNonZero == 0) + return DAG.getUNDEF(VT); + + // Special case for single non-zero, non-undef, element. + if (NumNonZero == 1) { + unsigned Idx = countTrailingZeros(NonZeros); + SDValue Item = Op.getOperand(Idx); + + // If this is an insertion of an i64 value on x86-32, and if the top bits of + // the value are obviously zero, truncate the value to i32 and do the + // insertion that way. Only do this if the value is non-constant or if the + // value is a constant being inserted into element 0. It is cheaper to do + // a constant pool load than it is to do a movd + shuffle. + if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && + (!IsAllConstants || Idx == 0)) { + if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { + // Handle SSE only. + assert(VT == MVT::v2i64 && "Expected an SSE value type!"); + MVT VecVT = MVT::v4i32; + + // Truncate the value (which may itself be a constant) to i32, and + // convert it to a vector with movd (S2V+shuffle to zero extend). + Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); + return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef( + Item, Idx * 2, true, Subtarget, DAG)); + } + } + + // If we have a constant or non-constant insertion into the low element of + // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into + // the rest of the elements. This will be matched as movd/movq/movss/movsd + // depending on what the source datatype is. + if (Idx == 0) { + if (NumZero == 0) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + + if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || + (ExtVT == MVT::i64 && Subtarget->is64Bit())) { + if (VT.is512BitVector()) { + SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, + Item, DAG.getIntPtrConstant(0, dl)); + } + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Expected an SSE value type!"); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. + return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + + // We can't directly insert an i8 or i16 into a vector, so zero extend + // it to i32 first. + if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { + Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); + if (VT.is256BitVector()) { + if (Subtarget->hasAVX()) { + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } else { + // Without AVX, we need to extend to a 128-bit vector and then + // insert into the 256-bit vector. + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); + } + } else { + assert(VT.is128BitVector() && "Expected an SSE value type!"); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + return DAG.getBitcast(VT, Item); + } + } + + // Is it a vector logical left shift? 
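// [Editor's note] For example, a v2i64 build_vector (0, x): SCALAR_TO_VECTOR
// places x in element 0, and a whole-register byte shift left by 8 bytes
// (PSLLDQ) moves it to element 1 while shifting zeros into element 0, giving
// exactly <0, x> (illustration only, not part of the imported file).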
+ if (NumElems == 2 && Idx == 1 && + X86::isZeroNode(Op.getOperand(0)) && + !X86::isZeroNode(Op.getOperand(1))) { + unsigned NumBits = VT.getSizeInBits(); + return getVShift(true, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + VT, Op.getOperand(1)), + NumBits/2, DAG, *this, dl); + } + + if (IsAllConstants) // Otherwise, it's better to do a constpool load. + return SDValue(); + + // Otherwise, if this is a vector with i32 or f32 elements, and the element + // is a non-constant being inserted into an element other than the low one, + // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka + // movd/movss) to move this into the low element, then shuffle it into + // place. + if (EVTBits == 32) { + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); + } + } + + // Splat is obviously ok. Let legalizer expand it to a shuffle. + if (Values.size() == 1) { + if (EVTBits == 32) { + // Instead of a shuffle like this: + // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> + // Check if it's possible to issue this instead. + // shuffle (vload ptr)), undef, <1, 1, 1, 1> + unsigned Idx = countTrailingZeros(NonZeros); + SDValue Item = Op.getOperand(Idx); + if (Op.getNode()->isOnlyUserOf(Item.getNode())) + return LowerAsSplatVectorLoad(Item, VT, dl, DAG); + } + return SDValue(); + } + + // A vector full of immediates; various special cases are already + // handled, so this is best done with a single constant-pool load. + if (IsAllConstants) + return SDValue(); + + // For AVX-length vectors, see if we can use a vector load to get all of the + // elements, otherwise build the individual 128-bit pieces and use + // shuffles to put them in place. + if (VT.is256BitVector() || VT.is512BitVector()) { + SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems); + + // Check for a build vector of consecutive loads. + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) + return LD; + + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); + + // Build both the lower and upper subvector. + SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[0], NumElems/2)); + SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[NumElems / 2], NumElems/2)); + + // Recreate the wider vector with the lower and upper part. + if (VT.is256BitVector()) + return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); + return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); + } + + // Let legalizer expand 2-wide build_vectors. + if (EVTBits == 64) { + if (NumNonZero == 1) { + // One half is zero or undef. + unsigned Idx = countTrailingZeros(NonZeros); + SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, + Op.getOperand(Idx)); + return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); + } + return SDValue(); + } + + // If element VT is < 32 bits, convert it to inserts into a zero vector. 
+ if (EVTBits == 8 && NumElems == 16) + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) + return V; + + if (EVTBits == 16 && NumElems == 8) + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) + return V; + + // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS + if (EVTBits == 32 && NumElems == 4) + if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) + return V; + + // If element VT is == 32 bits, turn it into a number of shuffles. + SmallVector<SDValue, 8> V(NumElems); + if (NumElems == 4 && NumZero > 0) { + for (unsigned i = 0; i < 4; ++i) { + bool isZero = !(NonZeros & (1ULL << i)); + if (isZero) + V[i] = getZeroVector(VT, Subtarget, DAG, dl); + else + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + } + + for (unsigned i = 0; i < 2; ++i) { + switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { + default: break; + case 0: + V[i] = V[i*2]; // Must be a zero vector. + break; + case 1: + V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); + break; + case 2: + V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); + break; + case 3: + V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); + break; + } + } + + bool Reverse1 = (NonZeros & 0x3) == 2; + bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; + int MaskVec[] = { + Reverse1 ? 1 : 0, + Reverse1 ? 0 : 1, + static_cast<int>(Reverse2 ? NumElems+1 : NumElems), + static_cast<int>(Reverse2 ? NumElems : NumElems+1) + }; + return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); + } + + if (Values.size() > 1 && VT.is128BitVector()) { + // Check for a build vector of consecutive loads. + for (unsigned i = 0; i < NumElems; ++i) + V[i] = Op.getOperand(i); + + // Check for elements which are consecutive loads. + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) + return LD; + + // Check for a build vector from mostly shuffle plus few inserting. + if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) + return Sh; + + // For SSE 4.1, use insertps to put the high elements into the low element. + if (Subtarget->hasSSE41()) { + SDValue Result; + if (Op.getOperand(0).getOpcode() != ISD::UNDEF) + Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); + else + Result = DAG.getUNDEF(VT); + + for (unsigned i = 1; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); + } + return Result; + } + + // Otherwise, expand into a number of unpckl*, start by extending each of + // our (non-undef) elements to the full vector width with the element in the + // bottom slot of the vector (which generates no code for SSE). + for (unsigned i = 0; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() != ISD::UNDEF) + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + else + V[i] = DAG.getUNDEF(VT); + } + + // Next, we iteratively mix elements, e.g. for v4f32: + // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> + // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> + // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> + unsigned EltStride = NumElems >> 1; + while (EltStride != 0) { + for (unsigned i = 0; i < EltStride; ++i) { + // If V[i+EltStride] is undef and this is the first round of mixing, + // then it is safe to just drop this shuffle: V[i] is already in the + // right place, the one element (since it's the first round) being + // inserted as undef can be dropped. 
This isn't safe for successive + // rounds because they will permute elements within both vectors. + if (V[i+EltStride].getOpcode() == ISD::UNDEF && + EltStride == NumElems/2) + continue; + + V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); + } + EltStride >>= 1; + } + return V[0]; + } + return SDValue(); +} + +// 256-bit AVX can use the vinsertf128 instruction +// to create 256-bit vectors from two other 128-bit ones. +static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + MVT ResVT = Op.getSimpleValueType(); + + assert((ResVT.is256BitVector() || + ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + if (ResVT.is256BitVector()) + return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); + + if (Op.getNumOperands() == 4) { + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), + ResVT.getVectorNumElements()/2); + SDValue V3 = Op.getOperand(2); + SDValue V4 = Op.getOperand(3); + return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), + Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl); + } + return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); +} + +static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG & DAG) { + SDLoc dl(Op); + MVT ResVT = Op.getSimpleValueType(); + unsigned NumOfOperands = Op.getNumOperands(); + + assert(isPowerOf2_32(NumOfOperands) && + "Unexpected number of operands in CONCAT_VECTORS"); + + SDValue Undef = DAG.getUNDEF(ResVT); + if (NumOfOperands > 2) { + // Specialize the cases when all, or all but one, of the operands are undef. 
+ unsigned NumOfDefinedOps = 0; + unsigned OpIdx = 0; + for (unsigned i = 0; i < NumOfOperands; i++) + if (!Op.getOperand(i).isUndef()) { + NumOfDefinedOps++; + OpIdx = i; + } + if (NumOfDefinedOps == 0) + return Undef; + if (NumOfDefinedOps == 1) { + unsigned SubVecNumElts = + Op.getOperand(OpIdx).getValueType().getVectorNumElements(); + SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, + Op.getOperand(OpIdx), IdxVal); + } + + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), + ResVT.getVectorNumElements()/2); + SmallVector<SDValue, 2> Ops; + for (unsigned i = 0; i < NumOfOperands/2; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + Ops.clear(); + for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); + } + + // 2 operands + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + assert(V1.getValueType() == V2.getValueType() && + V1.getValueType().getVectorNumElements() == NumElems/2 && + "Unexpected operands in CONCAT_VECTORS"); + + if (ResVT.getSizeInBits() >= 16) + return Op; // The operation is legal with KUNPCK + + bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); + SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); + if (IsZeroV1 && IsZeroV2) + return ZeroVec; + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + if (V2.isUndef()) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + if (IsZeroV2) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); + + SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); + if (V1.isUndef()) + V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); + + if (IsZeroV1) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); + + V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT.getVectorElementType() == MVT::i1) + return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); + + assert((VT.is256BitVector() && Op.getNumOperands() == 2) || + (VT.is512BitVector() && (Op.getNumOperands() == 2 || + Op.getNumOperands() == 4))); + + // AVX can use the vinsertf128 instruction to create 256-bit vectors + // from two other 128-bit ones. + + // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors + return LowerAVXCONCAT_VECTORS(Op, DAG); +} + +//===----------------------------------------------------------------------===// +// Vector shuffle lowering +// +// This is an experimental code path for lowering vector shuffles on x86. It is +// designed to handle arbitrary vector shuffles and blends, gracefully +// degrading performance as necessary. It works hard to recognize idiomatic +// shuffles and lower them to optimal instruction patterns without leaving +// a framework that allows reasonably efficient handling of all vector shuffle +// patterns. +//===----------------------------------------------------------------------===// + +/// \brief Tiny helper function to identify a no-op mask. 
+/// +/// This is a somewhat boring predicate function. It checks whether the mask +/// array input, which is assumed to be a single-input shuffle mask of the kind +/// used by the X86 shuffle instructions (not a fully general +/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an +/// in-place shuffle are 'no-op's. +static bool isNoopShuffleMask(ArrayRef<int> Mask) { + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != i) + return false; + return true; +} + +/// \brief Helper function to classify a mask as a single-input mask. +/// +/// This isn't a generic single-input test because in the vector shuffle +/// lowering we canonicalize single inputs to be the first input operand. This +/// means we can more quickly test for a single input by only checking whether +/// an input from the second operand exists. We also assume that the size of +/// mask corresponds to the size of the input vectors which isn't true in the +/// fully general case. +static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { + for (int M : Mask) + if (M >= (int)Mask.size()) + return false; + return true; +} + +/// \brief Test whether there are elements crossing 128-bit lanes in this +/// shuffle mask. +/// +/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations +/// and we routinely test for these. +static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + return true; + return false; +} + +/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. +/// +/// This checks a shuffle mask to see if it is performing the same +/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies +/// that it is also not lane-crossing. It may however involve a blend from the +/// same lane of a second vector. +/// +/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is +/// non-trivial to compute in the face of undef lanes. The representation is +/// *not* suitable for use with existing 128-bit shuffles as it will contain +/// entries from both V1 and V2 inputs to the wider mask. +static bool +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + RepeatedMask.resize(LaneSize, -1); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + if ((Mask[i] % Size) / LaneSize != i / LaneSize) + // This entry crosses lanes, so there is no way to model this shuffle. + return false; + + // Ok, handle the in-lane shuffles by detecting if and when they repeat. + if (RepeatedMask[i % LaneSize] == -1) + // This is the first non-undef entry in this slot of a 128-bit lane. + RepeatedMask[i % LaneSize] = + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; + else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) + // Found a mismatch with the repeated mask. + return false; + } + return true; +} + +/// \brief Checks whether a shuffle mask is equivalent to an explicit list of +/// arguments. +/// +/// This is a fast way to test a shuffle mask against a fixed pattern: +/// +/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... 
} +/// +/// It returns true if the mask is exactly as wide as the argument list, and +/// each element of the mask is either -1 (signifying undef) or the value given +/// in the argument. +static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, + ArrayRef<int> ExpectedMask) { + if (Mask.size() != ExpectedMask.size()) + return false; + + int Size = Mask.size(); + + // If the values are build vectors, we can look through them to find + // equivalent inputs that make the shuffles equivalent. + auto *BV1 = dyn_cast<BuildVectorSDNode>(V1); + auto *BV2 = dyn_cast<BuildVectorSDNode>(V2); + + for (int i = 0; i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) { + auto *MaskBV = Mask[i] < Size ? BV1 : BV2; + auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; + if (!MaskBV || !ExpectedBV || + MaskBV->getOperand(Mask[i] % Size) != + ExpectedBV->getOperand(ExpectedMask[i] % Size)) + return false; + } + + return true; +} + +/// \brief Get a 4-lane 8-bit shuffle immediate for a mask. +/// +/// This helper function produces an 8-bit shuffle immediate corresponding to +/// the ubiquitous shuffle encoding scheme used in x86 instructions for +/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for +/// example. +/// +/// NB: We rely heavily on "undef" masks preserving the input lane. +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, + SelectionDAG &DAG) { + assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); + assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); + assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); + assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); + assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); + + unsigned Imm = 0; + Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; + Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; + Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; + Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; + return DAG.getConstant(Imm, DL, MVT::i8); +} + +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + SmallBitVector Zeroable(Mask.size(), false); + + while (V1.getOpcode() == ISD::BITCAST) + V1 = V1->getOperand(0); + while (V2.getOpcode() == ISD::BITCAST) + V2 = V2->getOperand(0); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + Zeroable[i] = true; + continue; + } + + // If this is an index into a build_vector node (which has the same number + // of elements), dig out the input value and use it. + SDValue V = M < Size ? 
V1 : V2; + if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) + continue; + + SDValue Input = V.getOperand(M % Size); + // The UNDEF opcode check really should be dead code here, but not quite + // worth asserting on (it isn't invalid, just unexpected). + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) + Zeroable[i] = true; + } + + return Zeroable; +} + +// X86 has dedicated unpack instructions that can handle specific blend +// operations: UNPCKH and UNPCKL. +static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + SmallVector<int, 8> Unpckl; + SmallVector<int, 8> Unpckh; + + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); + int HiPos = LoPos + NumEltsInLane / 2; + Unpckl.push_back(LoPos); + Unpckh.push_back(HiPos); + } + + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + + // Commute and try again. + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); + + return SDValue(); +} + +/// \brief Try to emit a bitmask instruction for a shuffle. +/// +/// This handles cases where we can model a blend exactly as a bitmask due to +/// one of the inputs being zeroable. +static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + MVT EltVT = VT.getVectorElementType(); + int NumEltBits = EltVT.getSizeInBits(); + MVT IntEltVT = MVT::getIntegerVT(NumEltBits); + SDValue Zero = DAG.getConstant(0, DL, IntEltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + IntEltVT); + if (EltVT.isFloatingPoint()) { + Zero = DAG.getBitcast(EltVT, Zero); + AllOnes = DAG.getBitcast(EltVT, AllOnes); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Zeroable[i]) + continue; + if (Mask[i] % Size != i) + return SDValue(); // Not a blend. + if (!V) + V = Mask[i] < Size ? V1 : V2; + else if (V != (Mask[i] < Size ? V1 : V2)) + return SDValue(); // Can only let one input through the mask. + + VMaskOps[i] = AllOnes; + } + if (!V) + return SDValue(); // No non-zeroable elements! + + SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); + V = DAG.getNode(VT.isFloatingPoint() + ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, + DL, VT, V, VMask); + return V; +} + +/// \brief Try to emit a blend instruction for a shuffle using bit math. +/// +/// This is used as a fallback approach when first class blend instructions are +/// unavailable. Currently it is only suitable for integer vectors, but could +/// be generalized for floating point vectors if desirable. 
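/// (Editor's illustration, not part of the imported file: a v4i32 blend with
/// mask <0, 5, 2, 7> builds V1Mask = <-1, 0, -1, 0> and computes
/// (V1 & V1Mask) | (V2 & ~V1Mask), i.e. one PAND, one PANDN and one POR.)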
+static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT.isInteger() && "Only supports integer vector types!"); + MVT EltVT = VT.getVectorElementType(); + int NumEltBits = EltVT.getSizeInBits(); + SDValue Zero = DAG.getConstant(0, DL, EltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + EltVT); + SmallVector<SDValue, 16> MaskOps; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size) + return SDValue(); // Shuffled input! + MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); + } + + SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps); + V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); + // We have to cast V2 around. + MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, + DAG.getBitcast(MaskVT, V1Mask), + DAG.getBitcast(MaskVT, V2))); + return DAG.getNode(ISD::OR, DL, VT, V1, V2); +} + +/// \brief Try to emit a blend instruction for a shuffle. +/// +/// This doesn't do any checks for the availability of instructions for blending +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to +/// be matched in the backend with the type given. What it does check for is +/// that the shuffle mask is a blend, or convertible into a blend with zero. +static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Original, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + SmallVector<int, 8> Mask(Original.begin(), Original.end()); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + bool ForceV1Zero = false, ForceV2Zero = false; + + // Attempt to generate the binary blend mask. If an input is zero then + // we can use any lane. + // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. + unsigned BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + if (M == i) + continue; + if (M == i + Size) { + BlendMask |= 1u << i; + continue; + } + if (Zeroable[i]) { + if (V1IsZero) { + ForceV1Zero = true; + Mask[i] = i; + continue; + } + if (V2IsZero) { + ForceV2Zero = true; + BlendMask |= 1u << i; + Mask[i] = i + Size; + continue; + } + } + return SDValue(); // Shuffled input! + } + + // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. + if (ForceV1Zero) + V1 = getZeroVector(VT, Subtarget, DAG, DL); + if (ForceV2Zero) + V2 = getZeroVector(VT, Subtarget, DAG, DL); + + auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { + unsigned ScaledMask = 0; + for (int i = 0; i != Size; ++i) + if (BlendMask & (1u << i)) + for (int j = 0; j != Scale; ++j) + ScaledMask |= 1u << (i * Scale + j); + return ScaledMask; + }; + + switch (VT.SimpleTy) { + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4f64: + case MVT::v8f32: + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8)); + + case MVT::v4i64: + case MVT::v8i32: + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // FALLTHROUGH + case MVT::v2i64: + case MVT::v4i32: + // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into + // that instruction. 
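// [Editor's note] The blend immediate is rescaled from the original element
// width to 32-bit lanes: for example, a v4i64 blend mask of 0b0110 (take
// elements 1 and 2 from V2) becomes the VPBLENDD dword mask 0b00111100
// (illustration only, not part of the imported file).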
+ if (Subtarget->hasAVX2()) { + // Scale the blend by the number of 32-bit dwords per element. + int Scale = VT.getScalarSizeInBits() / 32; + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); + MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; + V1 = DAG.getBitcast(BlendVT, V1); + V2 = DAG.getBitcast(BlendVT, V2); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8))); + } + // FALLTHROUGH + case MVT::v8i16: { + // For integer shuffles we need to expand the mask and cast the inputs to + // v8i16s prior to blending. + int Scale = 8 / VT.getVectorNumElements(); + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = DAG.getBitcast(MVT::v8i16, V2); + return DAG.getBitcast(VT, + DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8))); + } + + case MVT::v16i16: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // We can lower these with PBLENDW which is mirrored across 128-bit lanes. + assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); + BlendMask = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 16) + BlendMask |= 1u << i; + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8)); + } + } + // FALLTHROUGH + case MVT::v16i8: + case MVT::v32i8: { + assert((VT.is128BitVector() || Subtarget->hasAVX2()) && + "256-bit byte-blends require AVX2 support!"); + + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return Masked; + + // Scale the blend by the number of bytes per element. + int Scale = VT.getScalarSizeInBits() / 8; + + // This form of blend is always done on bytes. Compute the byte vector + // type. + MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); + + // Compute the VSELECT mask. Note that VSELECT is really confusing in the + // mix of LLVM's code generator and the x86 backend. We tell the code + // generator that boolean values in the elements of an x86 vector register + // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' + // mapping a select to operand #1, and 'false' mapping to operand #2. The + // reality in x86 is that vector masks (pre-AVX-512) use only the high bit + // of the element (the remaining are ignored) and 0 in that high bit would + // mean operand #1 while 1 in the high bit would mean operand #2. So while + // the LLVM model for boolean values in vector elements gets the relevant + // bit set, it is set backwards and over constrained relative to x86's + // actual model. + SmallVector<SDValue, 32> VSELECTMask; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int j = 0; j < Scale; ++j) + VSELECTMask.push_back( + Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, + MVT::i8)); + + V1 = DAG.getBitcast(BlendVT, V1); + V2 = DAG.getBitcast(BlendVT, V2); + return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, + DAG.getNode(ISD::BUILD_VECTOR, DL, + BlendVT, VSELECTMask), + V1, V2)); + } + + default: + llvm_unreachable("Not a supported integer vector type!"); + } +} + +/// \brief Try to lower as a blend of elements from two inputs followed by +/// a single-input permutation. 
+/// +/// This matches the pattern where we can blend elements from two inputs and +/// then reduce the shuffle to a single-input permutation. +static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // We build up the blend mask while checking whether a blend is a viable way + // to reduce the shuffle. + SmallVector<int, 32> BlendMask(Mask.size(), -1); + SmallVector<int, 32> PermuteMask(Mask.size(), -1); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] < 0) + continue; + + assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); + + if (BlendMask[Mask[i] % Size] == -1) + BlendMask[Mask[i] % Size] = Mask[i]; + else if (BlendMask[Mask[i] % Size] != Mask[i]) + return SDValue(); // Can't blend in the needed input! + + PermuteMask[i] = Mask[i] % Size; + } + + SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); + return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); +} + +/// \brief Generic routine to decompose a shuffle and blend into indepndent +/// blends and permutes. +/// +/// This matches the extremely common pattern for handling combined +/// shuffle+blend operations on newer X86 ISAs where we have very fast blend +/// operations. It will try to pick the best arrangement of shuffles and +/// blends. +static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, + SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // Shuffle the input elements into the desired positions in V1 and V2 and + // blend them together. + SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + SmallVector<int, 32> BlendMask(Mask.size(), -1); + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] < Size) { + V1Mask[i] = Mask[i]; + BlendMask[i] = i; + } else if (Mask[i] >= Size) { + V2Mask[i] = Mask[i] - Size; + BlendMask[i] = i + Size; + } + + // Try to lower with the simpler initial blend strategy unless one of the + // input shuffles would be a no-op. We prefer to shuffle inputs as the + // shuffle may be able to fold with a load or other benefit. However, when + // we'll have to do 2x as many shuffles in order to achieve this, blending + // first is a better strategy. + if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) + if (SDValue BlendPerm = + lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + return BlendPerm; + + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); +} + +/// \brief Try to lower a vector shuffle as a byte rotation. +/// +/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary +/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use +/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will +/// try to generically lower a vector shuffle through such an pattern. It +/// does not check for the profitability of lowering either as PALIGNR or +/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. +/// This matches shuffle vectors that look like: +/// +/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] +/// +/// Essentially it concatenates V1 and V2, shifts right by some number of +/// elements, and takes the low elements as the result. 
Note that while this is +/// specified as a *right shift* because x86 is little-endian, it is a *left +/// rotate* of the vector lanes. +static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + + int NumElts = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; + int NumLaneElts = NumElts / NumLanes; + + // We need to detect various ways of spelling a rotation: + // [11, 12, 13, 14, 15, 0, 1, 2] + // [-1, 12, 13, 14, -1, -1, 1, -1] + // [-1, -1, -1, -1, -1, -1, 1, 2] + // [ 3, 4, 5, 6, 7, 8, 9, 10] + // [-1, 4, 5, 6, -1, -1, 9, -1] + // [-1, 4, 5, 6, -1, -1, -1, -1] + int Rotation = 0; + SDValue Lo, Hi; + for (int l = 0; l < NumElts; l += NumLaneElts) { + for (int i = 0; i < NumLaneElts; ++i) { + if (Mask[l + i] == -1) + continue; + assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!"); + + // Get the mod-Size index and lane correct it. + int LaneIdx = (Mask[l + i] % NumElts) - l; + // Make sure it was in this lane. + if (LaneIdx < 0 || LaneIdx >= NumLaneElts) + return SDValue(); + + // Determine where a rotated vector would have started. + int StartIdx = i - LaneIdx; + if (StartIdx == 0) + // The identity rotation isn't interesting, stop. + return SDValue(); + + // If we found the tail of a vector the rotation must be the missing + // front. If we found the head of a vector, it must be how much of the + // head. + int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx; + + if (Rotation == 0) + Rotation = CandidateRotation; + else if (Rotation != CandidateRotation) + // The rotations don't match, so we can't match this mask. + return SDValue(); + + // Compute which value this mask is pointing at. + SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2; + + // Compute which of the two target values this index should be assigned + // to. This reflects whether the high elements are remaining or the low + // elements are remaining. + SDValue &TargetV = StartIdx < 0 ? Hi : Lo; + + // Either set up this value if we've not encountered it before, or check + // that it remains consistent. + if (!TargetV) + TargetV = MaskV; + else if (TargetV != MaskV) + // This may be a rotation, but it pulls from the inputs in some + // unsupported interleaving. + return SDValue(); + } + } + + // Check that we successfully analyzed the mask, and normalize the results. + assert(Rotation != 0 && "Failed to locate a viable rotation!"); + assert((Lo || Hi) && "Failed to find a rotated input vector!"); + if (!Lo) + Lo = Hi; + else if (!Hi) + Hi = Lo; + + // The actual rotate instruction rotates bytes, so we need to scale the + // rotation based on how many bytes are in the vector lane. + int Scale = 16 / NumLaneElts; + + // SSSE3 targets can use the palignr instruction. + if (Subtarget->hasSSSE3()) { + // Cast the inputs to i8 vector of correct length to match PALIGNR. 
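// [Editor's note] Scale converts the rotation amount from elements to bytes
// (Scale = 16 / NumLaneElts), so for example a 3-element rotation of a v8i16
// shuffle is emitted as PALIGNR with immediate 6 (illustration only, not part
// of the imported file).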
+ MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); + Lo = DAG.getBitcast(AlignVT, Lo); + Hi = DAG.getBitcast(AlignVT, Hi); + + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, + DAG.getConstant(Rotation * Scale, DL, MVT::i8))); + } + + assert(VT.is128BitVector() && + "Rotate-based lowering only supports 128-bit lowering!"); + assert(Mask.size() <= 16 && + "Can shuffle at most 16 bytes in a 128-bit vector!"); + + // Default SSE2 implementation + int LoByteShift = 16 - Rotation * Scale; + int HiByteShift = Rotation * Scale; + + // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. + Lo = DAG.getBitcast(MVT::v2i64, Lo); + Hi = DAG.getBitcast(MVT::v2i64, Hi); + + SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, + DAG.getConstant(LoByteShift, DL, MVT::i8)); + SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, + DAG.getConstant(HiByteShift, DL, MVT::i8)); + return DAG.getBitcast(VT, + DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); +} + +/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). +/// +/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and +/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function +/// matches elements from one of the input vectors shuffled to the left or +/// right with zeroable elements 'shifted in'. It handles both the strictly +/// bit-wise element shifts and the byte shift across an entire 128-bit double +/// quad word lane. +/// +/// PSHL : (little-endian) left bit shift. +/// [ zz, 0, zz, 2 ] +/// [ -1, 4, zz, -1 ] +/// PSRL : (little-endian) right bit shift. +/// [ 1, zz, 3, zz] +/// [ -1, -1, 7, zz] +/// PSLLDQ : (little-endian) left byte shift +/// [ zz, 0, 1, 2, 3, 4, 5, 6] +/// [ zz, zz, -1, -1, 2, 3, 4, -1] +/// [ zz, zz, zz, zz, zz, zz, -1, 1] +/// PSRLDQ : (little-endian) right byte shift +/// [ 5, 6, 7, zz, zz, zz, zz, zz] +/// [ -1, 5, 6, 7, zz, zz, zz, zz] +/// [ 1, 2, -1, -1, -1, -1, zz, zz] +static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + int Size = Mask.size(); + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + auto CheckZeros = [&](int Shift, int Scale, bool Left) { + for (int i = 0; i < Size; i += Scale) + for (int j = 0; j < Shift; ++j) + if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) + return false; + + return true; + }; + + auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { + for (int i = 0; i != Size; i += Scale) { + unsigned Pos = Left ? i + Shift : i; + unsigned Low = Left ? i : i + Shift; + unsigned Len = Scale - Shift; + if (!isSequentialOrUndefInRange(Mask, Pos, Len, + Low + (V == V1 ? 0 : Size))) + return SDValue(); + } + + int ShiftEltBits = VT.getScalarSizeInBits() * Scale; + bool ByteShift = ShiftEltBits > 64; + unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) + : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); + int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); + + // Normalize the scale for byte shifts to still produce an i64 element + // type. + Scale = ByteShift ? Scale / 2 : Scale; + + // We need to round trip through the appropriate type for the shift. 
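// [Editor's note] For example, the v8i16 mask [zz, 0, zz, 2, zz, 4, zz, 6]
// matches with Scale = 2 and Shift = 1: the input is bitcast to v4i32 and
// shifted left by 16 bits (PSLLD $16), which moves each even word up one
// position and shifts zeros into the vacated lanes (illustration only, not
// part of the imported file).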
+ MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); + MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale); + assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && + "Illegal integer vector type"); + V = DAG.getBitcast(ShiftVT, V); + + V = DAG.getNode(OpCode, DL, ShiftVT, V, + DAG.getConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getBitcast(VT, V); + }; + + // SSE/AVX supports logical shifts up to 64-bit integers - so we can just + // keep doubling the size of the integer elements up to that. We can + // then shift the elements of the integer vector by whole multiples of + // their width within the elements of the larger integer vector. Test each + // multiple to see if we can find a match with the moved element indices + // and that the shifted in elements are all zeroable. + for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2) + for (int Shift = 1; Shift != Scale; ++Shift) + for (bool Left : {true, false}) + if (CheckZeros(Shift, Scale, Left)) + for (SDValue V : {V1, V2}) + if (SDValue Match = MatchShift(Shift, Scale, Left, V)) + return Match; + + // no match + return SDValue(); +} + +/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. +static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + assert(!Zeroable.all() && "Fully zeroable shuffle mask"); + + int Size = Mask.size(); + int HalfSize = Size / 2; + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + // Upper half must be undefined. + if (!isUndefInRange(Mask, HalfSize, HalfSize)) + return SDValue(); + + // EXTRQ: Extract Len elements from lower half of source, starting at Idx. + // Remainder of lower half result is zero and upper half is all undef. + auto LowerAsEXTRQ = [&]() { + // Determine the extraction length from the part of the + // lower half that isn't zeroable. + int Len = HalfSize; + for (; Len > 0; --Len) + if (!Zeroable[Len - 1]) + break; + assert(Len > 0 && "Zeroable shuffle mask"); + + // Attempt to match first Len sequential elements from the lower half. + SDValue Src; + int Idx = -1; + for (int i = 0; i != Len; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + SDValue &V = (M < Size ? V1 : V2); + M = M % Size; + + // The extracted elements must start at a valid index and all mask + // elements must be in the lower half. + if (i > M || M >= HalfSize) + return SDValue(); + + if (Idx < 0 || (Src == V && Idx == (M - i))) { + Src = V; + Idx = M - i; + continue; + } + return SDValue(); + } + + if (Idx < 0) + return SDValue(); + + assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); + int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + }; + + if (SDValue ExtrQ = LowerAsEXTRQ()) + return ExtrQ; + + // INSERTQ: Extract lowest Len elements from lower half of second source and + // insert over first source, starting at Idx. + // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } + auto LowerAsInsertQ = [&]() { + for (int Idx = 0; Idx != HalfSize; ++Idx) { + SDValue Base; + + // Attempt to match first source from mask before insertion point. 
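Both EXTRQI above and INSERTQI below take a 6-bit bit length and a 6-bit bit index, which is why the element counts are scaled by the element width and masked with 0x3f. A minimal sketch of that packing (makeExtrqImm is a made-up name):

// Minimal sketch, not part of the patch: pack the EXTRQI/INSERTQI immediates
// from an element-count length and index.
#include <cstdio>

struct ExtrqImm { int BitLen, BitIdx; };

static ExtrqImm makeExtrqImm(int Len, int Idx, int EltBits) {
  return { (Len * EltBits) & 0x3f, (Idx * EltBits) & 0x3f };
}

int main() {
  // Extract 3 x i16 starting at element 1: 48 bits at bit offset 16.
  ExtrqImm Imm = makeExtrqImm(3, 1, 16);
  std::printf("BitLen=%d BitIdx=%d\n", Imm.BitLen, Imm.BitIdx);
  return 0;
}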
+ if (isUndefInRange(Mask, 0, Idx)) { + /* EMPTY */ + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { + Base = V1; + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { + Base = V2; + } else { + continue; + } + + // Extend the extraction length looking to match both the insertion of + // the second source and the remaining elements of the first. + for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { + SDValue Insert; + int Len = Hi - Idx; + + // Match insertion. + if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { + Insert = V1; + } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { + Insert = V2; + } else { + continue; + } + + // Match the remaining elements of the lower half. + if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { + /* EMPTY */ + } else if ((!Base || (Base == V1)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { + Base = V1; + } else if ((!Base || (Base == V2)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, + Size + Hi)) { + Base = V2; + } else { + continue; + } + + // We may not have a base (first source) - this can safely be undefined. + if (!Base) + Base = DAG.getUNDEF(VT); + + int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + } + } + + return SDValue(); + }; + + if (SDValue InsertQ = LowerAsInsertQ()) + return InsertQ; + + return SDValue(); +} + +/// \brief Lower a vector shuffle as a zero or any extension. +/// +/// Given a specific number of elements, element bit width, and extension +/// stride, produce either a zero or any extension based on the available +/// features of the subtarget. The extended elements are consecutive and +/// begin and can start from an offseted element index in the input; to +/// avoid excess shuffling the offset must either being in the bottom lane +/// or at the start of a higher lane. All extended elements must be from +/// the same lane. +static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( + SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, + ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(Scale > 1 && "Need a scale to extend."); + int EltBits = VT.getScalarSizeInBits(); + int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = 128 / EltBits; + int OffsetLane = Offset / NumEltsPerLane; + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && + "Only 8, 16, and 32 bit elements can be extended."); + assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); + assert(0 <= Offset && "Extension offset must be positive."); + assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && + "Extension offset must be in the first lane or start an upper lane."); + + // Check that an index is in same lane as the base offset. + auto SafeOffset = [&](int Idx) { + return OffsetLane == (Idx / NumEltsPerLane); + }; + + // Shift along an input so that the offset base moves to the first element. + auto ShuffleOffset = [&](SDValue V) { + if (!Offset) + return V; + + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = 0; i * Scale < NumElements; ++i) { + int SrcIdx = i + Offset; + ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; + } + return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); + }; + + // Found a valid zext mask! 
Try various lowering strategies based on the + // input type and available ISA extensions. + if (Subtarget->hasSSE41()) { + // Not worth offseting 128-bit vectors if scale == 2, a pattern using + // PUNPCK will catch this in a later shuffle match. + if (Offset && Scale == 2 && VT.is128BitVector()) + return SDValue(); + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), + NumElements / Scale); + InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV)); + return DAG.getBitcast(VT, InputV); + } + + assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); + + // For any extends we can cheat for larger element sizes and use shuffle + // instructions that can fold with a load and/or copy. + if (AnyExt && EltBits == 32) { + int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, + -1}; + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getBitcast(MVT::v4i32, InputV), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + } + if (AnyExt && EltBits == 16 && Scale > 2) { + int PSHUFDMask[4] = {Offset / 2, -1, + SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; + InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getBitcast(MVT::v4i32, InputV), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); + int PSHUFWMask[4] = {1, -1, -1, -1}; + unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); + return DAG.getBitcast( + VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, + DAG.getBitcast(MVT::v8i16, InputV), + getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); + } + + // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes + // to 64-bits. + if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { + assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); + assert(VT.is128BitVector() && "Unexpected vector width!"); + + int LoIdx = Offset * EltBits; + SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(LoIdx, DL, MVT::i8))); + + if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || + !SafeOffset(Offset + 1)) + return DAG.getNode(ISD::BITCAST, DL, VT, Lo); + + int HiIdx = (Offset + 1) * EltBits; + SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(HiIdx, DL, MVT::i8))); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); + } + + // If this would require more than 2 unpack instructions to expand, use + // pshufb when available. We can only use more than 2 unpack instructions + // when zero extending i8 elements which also makes it easier to use pshufb. + if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { + assert(NumElements == 16 && "Unexpected byte vector width!"); + SDValue PSHUFBMask[16]; + for (int i = 0; i < 16; ++i) { + int Idx = Offset + (i / Scale); + PSHUFBMask[i] = DAG.getConstant( + (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); + } + InputV = DAG.getBitcast(MVT::v16i8, InputV); + return DAG.getBitcast(VT, + DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v16i8, PSHUFBMask))); + } + + // If we are extending from an offset, ensure we start on a boundary that + // we can unpack from. 
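The PSHUFB path above builds a 16-byte control vector in which 0x80 zeroes a destination byte and any other value selects a source byte. A minimal sketch of that control for a zero extension of i8 elements, assuming the offset stays in the low lane so the SafeOffset check can be dropped (zextPshufbMask is a made-up name):

// Minimal sketch, not part of the patch.
#include <array>
#include <cstdio>

static std::array<unsigned char, 16> zextPshufbMask(int Scale, int Offset) {
  std::array<unsigned char, 16> M{};
  for (int i = 0; i < 16; ++i)
    M[i] = (i % Scale == 0) ? (unsigned char)(Offset + i / Scale) : 0x80;
  return M;
}

int main() {
  // Zero-extend the low 4 bytes to 32-bit elements (Scale = 4, Offset = 0):
  // control = [00 80 80 80 01 80 80 80 02 80 80 80 03 80 80 80].
  for (unsigned char B : zextPshufbMask(4, 0))
    std::printf("%02x ", B);
  std::printf("\n");
  return 0;
}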
+ int AlignToUnpack = Offset % (NumElements / Scale); + if (AlignToUnpack) { + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = AlignToUnpack; i < NumElements; ++i) + ShMask[i - AlignToUnpack] = i; + InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); + Offset -= AlignToUnpack; + } + + // Otherwise emit a sequence of unpacks. + do { + unsigned UnpackLoHi = X86ISD::UNPCKL; + if (Offset >= (NumElements / 2)) { + UnpackLoHi = X86ISD::UNPCKH; + Offset -= (NumElements / 2); + } + + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); + SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) + : getZeroVector(InputVT, Subtarget, DAG, DL); + InputV = DAG.getBitcast(InputVT, InputV); + InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); + Scale /= 2; + EltBits *= 2; + NumElements /= 2; + } while (Scale > 1); + return DAG.getBitcast(VT, InputV); +} + +/// \brief Try to lower a vector shuffle as a zero extension on any microarch. +/// +/// This routine will try to do everything in its power to cleverly lower +/// a shuffle which happens to match the pattern of a zero extend. It doesn't +/// check for the profitability of this lowering, it tries to aggressively +/// match this pattern. It will use all of the micro-architectural details it +/// can to emit an efficient lowering. It handles both blends with all-zero +/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to +/// masking out later). +/// +/// The reason we have dedicated lowering for zext-style shuffles is that they +/// are both incredibly common and often quite performance sensitive. +static SDValue lowerVectorShuffleAsZeroOrAnyExtend( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + int Bits = VT.getSizeInBits(); + int NumLanes = Bits / 128; + int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = NumElements / NumLanes; + assert(VT.getScalarSizeInBits() <= 32 && + "Exceeds 32-bit integer zero extension limit"); + assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); + + // Define a helper function to check a particular ext-scale and lower to it if + // valid. + auto Lower = [&](int Scale) -> SDValue { + SDValue InputV; + bool AnyExt = true; + int Offset = 0; + int Matches = 0; + for (int i = 0; i < NumElements; ++i) { + int M = Mask[i]; + if (M == -1) + continue; // Valid anywhere but doesn't tell us anything. + if (i % Scale != 0) { + // Each of the extended elements need to be zeroable. + if (!Zeroable[i]) + return SDValue(); + + // We no longer are in the anyext case. + AnyExt = false; + continue; + } + + // Each of the base elements needs to be consecutive indices into the + // same input vector. + SDValue V = M < NumElements ? V1 : V2; + M = M % NumElements; + if (!InputV) { + InputV = V; + Offset = M - (i / Scale); + } else if (InputV != V) + return SDValue(); // Flip-flopping inputs. + + // Offset must start in the lowest 128-bit lane or at the start of an + // upper lane. + // FIXME: Is it ever worth allowing a negative base offset? + if (!((0 <= Offset && Offset < NumEltsPerLane) || + (Offset % NumEltsPerLane) == 0)) + return SDValue(); + + // If we are offsetting, all referenced entries must come from the same + // lane. 
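Stripped of the offset and two-input handling, the per-element checks in this lambda amount to: every element off the Scale stride must be zeroable, and every element on the stride must be the next consecutive source element. A minimal sketch, with -2 marking a zeroable element and -1 an undef one (isZExtByScale is a made-up name):

// Minimal sketch, not part of the patch.
#include <cstdio>
#include <vector>

static bool isZExtByScale(const std::vector<int> &Mask, int Scale) {
  for (int i = 0, e = (int)Mask.size(); i < e; ++i) {
    int M = Mask[i];
    if (M == -1)
      continue; // Undef is fine anywhere.
    if (i % Scale != 0) {
      if (M != -2)
        return false; // Extended slots must be zeroable.
    } else if (M != i / Scale) {
      return false;   // Base slots must be consecutive elements 0, 1, 2, ...
    }
  }
  return true;
}

int main() {
  // A v8i16 zext of the low 4 elements: [0, zz, 1, zz, 2, zz, 3, zz].
  std::vector<int> Mask = {0, -2, 1, -2, 2, -2, 3, -2};
  std::printf("zext by 2: %s\n", isZExtByScale(Mask, 2) ? "yes" : "no");
  return 0;
}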
+ if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) + return SDValue(); + + if ((M % NumElements) != (Offset + (i / Scale))) + return SDValue(); // Non-consecutive strided elements. + Matches++; + } + + // If we fail to find an input, we have a zero-shuffle which should always + // have already been handled. + // FIXME: Maybe handle this here in case during blending we end up with one? + if (!InputV) + return SDValue(); + + // If we are offsetting, don't extend if we only match a single input, we + // can always do better by using a basic PSHUF or PUNPCK. + if (Offset != 0 && Matches < 2) + return SDValue(); + + return lowerVectorShuffleAsSpecificZeroOrAnyExtend( + DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); + }; + + // The widest scale possible for extending is to a 64-bit integer. + assert(Bits % 64 == 0 && + "The number of bits in a vector must be divisible by 64 on x86!"); + int NumExtElements = Bits / 64; + + // Each iteration, try extending the elements half as much, but into twice as + // many elements. + for (; NumExtElements < NumElements; NumExtElements *= 2) { + assert(NumElements % NumExtElements == 0 && + "The input vector size must be divisible by the extended size."); + if (SDValue V = Lower(NumElements / NumExtElements)) + return V; + } + + // General extends failed, but 128-bit vectors may be able to use MOVQ. + if (Bits != 128) + return SDValue(); + + // Returns one of the source operands if the shuffle can be reduced to a + // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. + auto CanZExtLowHalf = [&]() { + for (int i = NumElements / 2; i != NumElements; ++i) + if (!Zeroable[i]) + return SDValue(); + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) + return V1; + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) + return V2; + return SDValue(); + }; + + if (SDValue V = CanZExtLowHalf()) { + V = DAG.getBitcast(MVT::v2i64, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); + return DAG.getBitcast(VT, V); + } + + // No viable ext lowering found. + return SDValue(); +} + +/// \brief Try to get a scalar value for a specific element of a vector. +/// +/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. +static SDValue getScalarValueForVectorElement(SDValue V, int Idx, + SelectionDAG &DAG) { + MVT VT = V.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + // If the bitcasts shift the element size, we can't extract an equivalent + // element from it. + MVT NewVT = V.getSimpleValueType(); + if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) + return SDValue(); + + if (V.getOpcode() == ISD::BUILD_VECTOR || + (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { + // Ensure the scalar operand is the same size as the destination. + // FIXME: Add support for scalar truncation where possible. + SDValue S = V.getOperand(Idx); + if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) + return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S); + } + + return SDValue(); +} + +/// \brief Helper to test for a load that can be folded with x86 shuffles. +/// +/// This is particularly important because the set of instructions varies +/// significantly based on whether the operand is a load or not. 
+static bool isShuffleFoldableLoad(SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + return ISD::isNON_EXTLoad(V.getNode()); +} + +/// \brief Try to lower insertion of a single element into a zero vector. +/// +/// This is a common pattern that we have especially efficient patterns to lower +/// across all subtarget feature sets. +static SDValue lowerVectorShuffleAsElementInsertion( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + MVT ExtVT = VT; + MVT EltVT = VT.getVectorElementType(); + + int V2Index = std::find_if(Mask.begin(), Mask.end(), + [&Mask](int M) { return M >= (int)Mask.size(); }) - + Mask.begin(); + bool IsV1Zeroable = true; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (i != V2Index && !Zeroable[i]) { + IsV1Zeroable = false; + break; + } + + // Check for a single input from a SCALAR_TO_VECTOR node. + // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and + // all the smarts here sunk into that routine. However, the current + // lowering of BUILD_VECTOR makes that nearly impossible until the old + // vector shuffle lowering is dead. + SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), + DAG); + if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { + // We need to zext the scalar if it is smaller than an i32. + V2S = DAG.getBitcast(EltVT, V2S); + if (EltVT == MVT::i8 || EltVT == MVT::i16) { + // Using zext to expand a narrow element won't work for non-zero + // insertions. + if (!IsV1Zeroable) + return SDValue(); + + // Zero-extend directly to i32. + ExtVT = MVT::v4i32; + V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); + } + V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); + } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || + EltVT == MVT::i16) { + // Either not inserting from the low element of the input or the input + // element size is too small to use VZEXT_MOVL to clear the high bits. + return SDValue(); + } + + if (!IsV1Zeroable) { + // If V1 can't be treated as a zero vector we have fewer options to lower + // this. We can't support integer vectors or non-zero targets cheaply, and + // the V1 elements can't be permuted in any way. + assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); + if (!VT.isFloatingPoint() || V2Index != 0) + return SDValue(); + SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end()); + V1Mask[V2Index] = -1; + if (!isNoopShuffleMask(V1Mask)) + return SDValue(); + // This is essentially a special case blend operation, but if we have + // general purpose blend operations, they are always faster. Bail and let + // the rest of the lowering handle these as blends. + if (Subtarget->hasSSE41()) + return SDValue(); + + // Otherwise, use MOVSD or MOVSS. + assert((EltVT == MVT::f32 || EltVT == MVT::f64) && + "Only two types of floating point element types to handle!"); + return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, + ExtVT, V1, V2); + } + + // This lowering only works for the low element with floating point vectors. + if (VT.isFloatingPoint() && V2Index != 0) + return SDValue(); + + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); + if (ExtVT != VT) + V2 = DAG.getBitcast(VT, V2); + + if (V2Index != 0) { + // If we have 4 or fewer lanes we can cheaply shuffle the element into + // the desired position. 
Otherwise it is more efficient to do a vector + // shift left. We know that we can do a vector shift left because all + // the inputs are zero. + if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { + SmallVector<int, 4> V2Shuffle(Mask.size(), 1); + V2Shuffle[V2Index] = 0; + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); + } else { + V2 = DAG.getBitcast(MVT::v2i64, V2); + V2 = DAG.getNode( + X86ISD::VSHLDQ, DL, MVT::v2i64, V2, + DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, + DAG.getTargetLoweringInfo().getScalarShiftAmountTy( + DAG.getDataLayout(), VT))); + V2 = DAG.getBitcast(VT, V2); + } + } + return V2; +} + +/// \brief Try to lower broadcast of a single - truncated - integer element, +/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. +/// +/// This assumes we have AVX2. +static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, + int BroadcastIdx, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX2() && + "We can only lower integer broadcasts with AVX2!"); + + EVT EltVT = VT.getVectorElementType(); + EVT V0VT = V0.getValueType(); + + assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); + assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); + + EVT V0EltVT = V0VT.getVectorElementType(); + if (!V0EltVT.isInteger()) + return SDValue(); + + const unsigned EltSize = EltVT.getSizeInBits(); + const unsigned V0EltSize = V0EltVT.getSizeInBits(); + + // This is only a truncation if the original element type is larger. + if (V0EltSize <= EltSize) + return SDValue(); + + assert(((V0EltSize % EltSize) == 0) && + "Scalar type sizes must all be powers of 2 on x86!"); + + const unsigned V0Opc = V0.getOpcode(); + const unsigned Scale = V0EltSize / EltSize; + const unsigned V0BroadcastIdx = BroadcastIdx / Scale; + + if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && + V0Opc != ISD::BUILD_VECTOR) + return SDValue(); + + SDValue Scalar = V0.getOperand(V0BroadcastIdx); + + // If we're extracting non-least-significant bits, shift so we can truncate. + // Hopefully, we can fold away the trunc/srl/load into the broadcast. + // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer + // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. + if (const int OffsetIdx = BroadcastIdx % Scale) + Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, + DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); +} + +/// \brief Try to lower broadcast of a single element. +/// +/// For convenience, this code also bundles all of the subtarget feature set +/// filtering. While a little annoying to re-dispatch on type here, there isn't +/// a convenient way to factor it out. +/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? +static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (!Subtarget->hasAVX()) + return SDValue(); + if (VT.isInteger() && !Subtarget->hasAVX2()) + return SDValue(); + + // Check that the mask is a broadcast. 
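The truncated-broadcast path above turns a broadcast of a narrow element of a wider scalar into a right shift by OffsetIdx * EltSize bits followed by a truncate. A minimal sketch of that arithmetic for an i8 element of an i32 scalar (truncatedElement is a made-up name):

// Minimal sketch, not part of the patch.
#include <cstdint>
#include <cstdio>

static uint8_t truncatedElement(uint32_t WideScalar, unsigned BroadcastIdx) {
  const unsigned EltSize = 8;          // i8 result elements.
  const unsigned Scale = 32 / EltSize; // 4 narrow elements per i32 scalar.
  const unsigned OffsetIdx = BroadcastIdx % Scale;
  return (uint8_t)(WideScalar >> (OffsetIdx * EltSize)); // Truncate to i8.
}

int main() {
  // Byte 2 of 0xAABBCCDD is 0xBB; that value would then feed the broadcast.
  std::printf("0x%02X\n", (unsigned)truncatedElement(0xAABBCCDDu, 2));
  return 0;
}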
+ int BroadcastIdx = -1; + for (int M : Mask) + if (M >= 0 && BroadcastIdx == -1) + BroadcastIdx = M; + else if (M >= 0 && M != BroadcastIdx) + return SDValue(); + + assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " + "a sorted mask where the broadcast " + "comes from V1."); + + // Go up the chain of (vector) values to find a scalar load that we can + // combine with the broadcast. + for (;;) { + switch (V.getOpcode()) { + case ISD::CONCAT_VECTORS: { + int OperandSize = Mask.size() / V.getNumOperands(); + V = V.getOperand(BroadcastIdx / OperandSize); + BroadcastIdx %= OperandSize; + continue; + } + + case ISD::INSERT_SUBVECTOR: { + SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); + auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); + if (!ConstantIdx) + break; + + int BeginIdx = (int)ConstantIdx->getZExtValue(); + int EndIdx = + BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); + if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { + BroadcastIdx -= BeginIdx; + V = VInner; + } else { + V = VOuter; + } + continue; + } + } + break; + } + + // Check if this is a broadcast of a scalar. We special case lowering + // for scalars so that we can more effectively fold with loads. + // First, look through bitcast: if the original value has a larger element + // type than the shuffle, the broadcast element is in essence truncated. + // Make that explicit to ease folding. + if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) + if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( + DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + return TruncBroadcast; + + // Also check the simpler case, where we can directly reuse the scalar. + if (V.getOpcode() == ISD::BUILD_VECTOR || + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { + V = V.getOperand(BroadcastIdx); + + // If the scalar isn't a load, we can't broadcast from it in AVX1. + // Only AVX2 has register broadcasts. + if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) + return SDValue(); + } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { + // If we are broadcasting a load that is only used by the shuffle + // then we can reduce the vector load to the broadcasted scalar load. + LoadSDNode *Ld = cast<LoadSDNode>(V); + SDValue BaseAddr = Ld->getOperand(1); + EVT AddrVT = BaseAddr.getValueType(); + EVT SVT = VT.getScalarType(); + unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + SDValue NewAddr = DAG.getNode( + ISD::ADD, DL, AddrVT, BaseAddr, + DAG.getConstant(Offset, DL, AddrVT)); + V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); + } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { + // We can't broadcast from a vector register without AVX2, and we can only + // broadcast from the zero-element of a vector register. + return SDValue(); + } + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); +} + +// Check for whether we can use INSERTPS to perform the shuffle. We only use +// INSERTPS when the V1 elements are already in the correct locations +// because otherwise we can just always use two SHUFPS instructions which +// are much smaller to encode than a SHUFPS and an INSERTPS. We can also +// perform INSERTPS if a single V1 element is out of place and all V2 +// elements are zeroable. 
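The INSERTPS immediate built in the routine below packs three fields: bits [7:6] select the source element of the second operand, bits [5:4] the destination slot, and bits [3:0] a zero mask of destination slots. A minimal sketch of that encoding (insertPSImm is a made-up name):

// Minimal sketch, not part of the patch.
#include <cstdio>

static unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}

int main() {
  // Insert element 2 of the second operand into slot 1 and zero slot 3: 0x98.
  std::printf("imm = 0x%02X\n", insertPSImm(2, 1, 0x8));
  return 0;
}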
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + unsigned ZMask = 0; + int V1DstIndex = -1; + int V2DstIndex = -1; + bool V1UsedInPlace = false; + + for (int i = 0; i < 4; ++i) { + // Synthesize a zero mask from the zeroable elements (includes undefs). + if (Zeroable[i]) { + ZMask |= 1 << i; + continue; + } + + // Flag if we use any V1 inputs in place. + if (i == Mask[i]) { + V1UsedInPlace = true; + continue; + } + + // We can only insert a single non-zeroable element. + if (V1DstIndex != -1 || V2DstIndex != -1) + return SDValue(); + + if (Mask[i] < 4) { + // V1 input out of place for insertion. + V1DstIndex = i; + } else { + // V2 input for insertion. + V2DstIndex = i; + } + } + + // Don't bother if we have no (non-zeroable) element for insertion. + if (V1DstIndex == -1 && V2DstIndex == -1) + return SDValue(); + + // Determine element insertion src/dst indices. The src index is from the + // start of the inserted vector, not the start of the concatenated vector. + unsigned V2SrcIndex = 0; + if (V1DstIndex != -1) { + // If we have a V1 input out of place, we use V1 as the V2 element insertion + // and don't use the original V2 at all. + V2SrcIndex = Mask[V1DstIndex]; + V2DstIndex = V1DstIndex; + V2 = V1; + } else { + V2SrcIndex = Mask[V2DstIndex] - 4; + } + + // If no V1 inputs are used in place, then the result is created only from + // the zero mask and the V2 insertion - so remove V1 dependency. + if (!V1UsedInPlace) + V1 = DAG.getUNDEF(MVT::v4f32); + + unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + + // Insert the V2 element into the desired position. + SDLoc DL(Op); + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); +} + +/// \brief Try to lower a shuffle as a permute of the inputs followed by an +/// UNPCK instruction. +/// +/// This specifically targets cases where we end up with alternating between +/// the two inputs, and so can permute them into something that feeds a single +/// UNPCK instruction. Note that this routine only targets integer vectors +/// because for floating point vectors we have a generalized SHUFPS lowering +/// strategy that handles everything that doesn't *exactly* match an unpack, +/// making this clever lowering unnecessary. 
+static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!VT.isFloatingPoint() && + "This routine only supports integer vectors."); + assert(!isSingleInputShuffleMask(Mask) && + "This routine should only be used when blending two inputs."); + assert(Mask.size() >= 2 && "Single element masks are invalid."); + + int Size = Mask.size(); + + int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) { + return M >= 0 && M % Size < Size / 2; + }); + int NumHiInputs = std::count_if( + Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; }); + + bool UnpackLo = NumLoInputs >= NumHiInputs; + + auto TryUnpack = [&](MVT UnpackVT, int Scale) { + SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + // Each element of the unpack contains Scale elements from this mask. + int UnpackIdx = i / Scale; + + // We only handle the case where V1 feeds the first slots of the unpack. + // We rely on canonicalization to ensure this is the case. + if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) + return SDValue(); + + // Setup the mask for this input. The indexing is tricky as we have to + // handle the unpack stride. + SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; + VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = + Mask[i] % Size; + } + + // If we will have to shuffle both inputs to use the unpack, check whether + // we can just unpack first and shuffle the result. If so, skip this unpack. + if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && + !isNoopShuffleMask(V2Mask)) + return SDValue(); + + // Shuffle the inputs into place. + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + + // Cast the inputs to the type we will use to unpack them. + V1 = DAG.getBitcast(UnpackVT, V1); + V2 = DAG.getBitcast(UnpackVT, V2); + + // Unpack the inputs and cast the result back to the desired type. + return DAG.getBitcast( + VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + UnpackVT, V1, V2)); + }; + + // We try each unpack from the largest to the smallest to try and find one + // that fits this mask. + int OrigNumElements = VT.getVectorNumElements(); + int OrigScalarSize = VT.getScalarSizeInBits(); + for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) { + int Scale = ScalarSize / OrigScalarSize; + int NumElements = OrigNumElements / Scale; + MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements); + if (SDValue Unpack = TryUnpack(UnpackVT, Scale)) + return Unpack; + } + + // If none of the unpack-rooted lowerings worked (or were profitable) try an + // initial unpack. + if (NumLoInputs == 0 || NumHiInputs == 0) { + assert((NumLoInputs > 0 || NumHiInputs > 0) && + "We have to have *some* inputs!"); + int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; + + // FIXME: We could consider the total complexity of the permute of each + // possible unpacking. Or at the least we should consider how many + // half-crossings are created. + // FIXME: We could consider commuting the unpacks. 
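The fallback below relies on the fact that, after an UNPCKL-style interleave of the two low halves, element j of V1 sits at position 2*j and element j of V2 at position 2*j + 1, so a single follow-up permute can reproduce the original two-input shuffle. A minimal sketch that checks this for a concrete mask, with HalfOffset equal to 0:

// Minimal sketch, not part of the patch.
#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  const int Size = 8;
  std::vector<int> V1(Size), V2(Size);
  for (int i = 0; i < Size; ++i) { V1[i] = i; V2[i] = 100 + i; }

  // A two-input mask that only references the low halves (0..3 and 8..11).
  std::vector<int> Mask = {0, 9, 1, 8, 2, 11, 3, 10};

  // UNPCKL: interleave the low halves of V1 and V2.
  std::vector<int> Unpacked(Size);
  for (int j = 0; j < Size / 2; ++j) {
    Unpacked[2 * j] = V1[j];
    Unpacked[2 * j + 1] = V2[j];
  }

  // Reconstruct the shuffle with a single permute of the unpacked vector.
  for (int i = 0; i < Size; ++i) {
    int PermIdx = 2 * (Mask[i] % Size) + (Mask[i] < Size ? 0 : 1);
    int Direct = Mask[i] < Size ? V1[Mask[i]] : V2[Mask[i] - Size];
    assert(Unpacked[PermIdx] == Direct);
    std::printf("%d ", Unpacked[PermIdx]);
  }
  std::printf("\n");
  return 0;
}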
+ + SmallVector<int, 32> PermMask; + PermMask.assign(Size, -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); + + PermMask[i] = + 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); + } + return DAG.getVectorShuffle( + VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, + DL, VT, V1, V2), + DAG.getUNDEF(VT), PermMask); + } + + return SDValue(); +} + +/// \brief Handle lowering of 2-lane 64-bit floating point shuffles. +/// +/// This is the basis function for the 2-lane 64-bit shuffles as we have full +/// support for floating point shuffles but not integer shuffles. These +/// instructions will incur a domain crossing penalty on some chips though so +/// it is better to avoid lowering through this for integer vectors where +/// possible. +static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) { + // Use low duplicate instructions for masks that match their pattern. + if (Subtarget->hasSSE3()) + if (isShuffleEquivalent(V1, V2, Mask, {0, 0})) + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1); + + // Straight shuffle of a single input vector. Simulate this by using the + // single input as both of the "inputs" to this instruction.. + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + } + + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1, + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + } + assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); + assert(Mask[1] >= 2 && "Non-canonicalized blend!"); + + // If we have a single input, insert that into V1 if we can do so cheaply. + if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Try to use one of the special instruction patterns to handle two common + // blend patterns if a zero-blend above didn't work. + if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || + isShuffleEquivalent(V1, V2, Mask, {1, 3})) + if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) + // We can either use a special instruction to load over the low double or + // to move just the low double. + return DAG.getNode( + isShuffleFoldableLoad(V1S) ? 
X86ISD::MOVLPD : X86ISD::MOVSD, + DL, MVT::v2f64, V2, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) + return V; + + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); +} + +/// \brief Handle lowering of 2-lane 64-bit integer shuffles. +/// +/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by +/// the integer unit to minimize domain crossing penalties. However, for blends +/// it falls back to the floating point shuffle operation with appropriate bit +/// casting. +static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. + // We have to map the mask as it is actually a v4i32 shuffle instruction. + V1 = DAG.getBitcast(MVT::v4i32, V1); + int WidenedMask[4] = { + std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, + std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; + return DAG.getBitcast( + MVT::v2i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); + } + assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); + assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); + assert(Mask[0] < 2 && "We sort V1 to be the first input."); + assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + + // If we have a blend of two PACKUS operations an the blend aligns with the + // low and half halves, we can just merge the PACKUS operations. This is + // particularly important as it lets us merge shuffles that this routine itself + // creates. + auto GetPackNode = [](SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + return V.getOpcode() == X86ISD::PACKUS ? V : SDValue(); + }; + if (SDValue V1Pack = GetPackNode(V1)) + if (SDValue V2Pack = GetPackNode(V2)) + return DAG.getBitcast(MVT::v2i64, + DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, + Mask[0] == 0 ? V1Pack.getOperand(0) + : V1Pack.getOperand(1), + Mask[1] == 2 ? V2Pack.getOperand(0) + : V2Pack.getOperand(1))); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG)) + return Shift; + + // When loading a scalar and then shuffling it into a vector we can often do + // the insertion cheaply. 
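The single-input v2i64 path above widens the 2-element mask to a 4-element PSHUFD mask and then packs the usual 2-bits-per-element shuffle immediate, treating undef as 0. A minimal sketch of that widening and packing:

// Minimal sketch, not part of the patch.
#include <cstdio>

int main() {
  int Mask[2] = {1, 0}; // Swap the two 64-bit elements.
  int Widened[4];
  for (int i = 0; i < 2; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // Undef lanes are treated as 0.
    Widened[2 * i] = M * 2;
    Widened[2 * i + 1] = M * 2 + 1;
  }
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= (unsigned)Widened[i] << (2 * i);
  // {1, 0} widens to {2, 3, 0, 1}, i.e. a PSHUFD with immediate 0x4E.
  std::printf("imm = 0x%02X\n", Imm);
  return 0;
}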
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) + return V; + + // Try to use byte rotation instructions. + // Its more profitable for pre-SSSE3 to use shuffles/unpacks. + if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + // If we have direct support for blends, we should lower by decomposing into + // a permute. That will be faster than the domain cross. + if (IsBlendSupported) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, + Mask, DAG); + + // We implement this with SHUFPD which is pretty lame because it will likely + // incur 2 cycles of stall for integer vectors on Nehalem and older chips. + // However, all the alternatives are still more cycles and newer chips don't + // have this problem. It would be really nice if x86 had better shuffles here. + V1 = DAG.getBitcast(MVT::v2f64, V1); + V2 = DAG.getBitcast(MVT::v2f64, V2); + return DAG.getBitcast(MVT::v2i64, + DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); +} + +/// \brief Test whether this can be lowered with a single SHUFPS instruction. +/// +/// This is used to disable more specialized lowerings when the shufps lowering +/// will happen to be efficient. +static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { + // This routine only handles 128-bit shufps. + assert(Mask.size() == 4 && "Unsupported mask size!"); + + // To lower with a single SHUFPS we need to have the low half and high half + // each requiring a single input. + if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4)) + return false; + if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4)) + return false; + + return true; +} + +/// \brief Lower a vector shuffle using the SHUFPS instruction. +/// +/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. +/// It makes no assumptions about whether this is the *best* lowering, it simply +/// uses it. +static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + SDValue LowV = V1, HighV = V2; + int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 1) { + int V2Index = + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - + Mask.begin(); + + // Compute the index adjacent to V2Index and in the same half by toggling + // the low bit. + int V2AdjIndex = V2Index ^ 1; + + if (Mask[V2AdjIndex] == -1) { + // Handles all the cases where we have a single V2 element and an undef. 
+ // This will only ever happen in the high lanes because we commute the + // vector otherwise. + if (V2Index < 2) + std::swap(LowV, HighV); + NewMask[V2Index] -= 4; + } else { + // Handle the case where the V2 element ends up adjacent to a V1 element. + // To make this work, blend them together as the first step. + int V1Index = V2AdjIndex; + int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; + V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, + getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); + + // Now proceed to reconstruct the final blend as we have the necessary + // high or low half formed. + if (V2Index < 2) { + LowV = V2; + HighV = V1; + } else { + HighV = V2; + } + NewMask[V1Index] = 2; // We put the V1 element in V2[2]. + NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. + } + } else if (NumV2Elements == 2) { + if (Mask[0] < 4 && Mask[1] < 4) { + // Handle the easy case where we have V1 in the low lanes and V2 in the + // high lanes. + NewMask[2] -= 4; + NewMask[3] -= 4; + } else if (Mask[2] < 4 && Mask[3] < 4) { + // We also handle the reversed case because this utility may get called + // when we detect a SHUFPS pattern but can't easily commute the shuffle to + // arrange things in the right direction. + NewMask[0] -= 4; + NewMask[1] -= 4; + HighV = V1; + LowV = V2; + } else { + // We have a mixture of V1 and V2 in both low and high lanes. Rather than + // trying to place elements directly, just blend them and set up the final + // shuffle to place them. + + // The first two blend mask elements are for V1, the second two are for + // V2. + int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], + Mask[2] < 4 ? Mask[2] : Mask[3], + (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, + (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; + V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, + getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); + + // Now we do a normal shuffle of V1 by giving V1 as both operands to + // a blend. + LowV = HighV = V1; + NewMask[0] = Mask[0] < 4 ? 0 : 2; + NewMask[1] = Mask[0] < 4 ? 2 : 0; + NewMask[2] = Mask[2] < 4 ? 1 : 3; + NewMask[3] = Mask[2] < 4 ? 3 : 1; + } + } + return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, + getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); +} + +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Use even/odd duplicate instructions for masks that match their pattern. 
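The MOVSLDUP/MOVSHDUP checks below compare the mask against a fixed pattern in which an undef (-1) element matches anything. A minimal sketch of that kind of comparison (matchesPattern is a made-up name and a simplification of the equivalence check used here):

// Minimal sketch, not part of the patch.
#include <cstdio>
#include <vector>

static bool matchesPattern(const std::vector<int> &Mask,
                           const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Expected[i])
      return false; // A concrete element must match; undef always matches.
  return true;
}

int main() {
  // [0, -1, 2, -1] still matches the MOVSLDUP pattern {0, 0, 2, 2}.
  std::printf("%s\n", matchesPattern({0, -1, 2, -1}, {0, 0, 2, 2})
                          ? "movsldup"
                          : "no match");
  return 0;
}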
+ if (Subtarget->hasSSE3()) { + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); + if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); + } + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + } + + // Otherwise, use a straight shuffle of a single input vector. We pass the + // input vector to both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + } + + // There are special ways we can lower some single-element blends. However, we + // have custom ways we can lower more complex single-element blends below that + // we defer to if both this and BLENDPS fail to match, so restrict this to + // when the V2 input is targeting element 0 of the mask -- that is the fast + // case here. + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, + Mask, Subtarget, DAG)) + return V; + + if (Subtarget->hasSSE41()) { + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use INSERTPS if we can complete the shuffle efficiently. + if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) + return V; + + if (!isSingleSHUFPSMask(Mask)) + if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( + DL, MVT::v4f32, V1, V2, Mask, DAG)) + return BlendPerm; + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) + return V; + + // Otherwise fall back to a SHUFPS lowering strategy. + return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); +} + +/// \brief Lower 4-lane i32 vector shuffles. +/// +/// We try to handle these with integer-domain shuffles where we can, but for +/// blends we use the floating point domain blend instructions. +static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. 
+ // We coerce the shuffle pattern to be compatible with UNPCK instructions + // but we aren't actually going to use the UNPCK instruction because doing + // so prevents folding a load into this instruction or making a copy. + const int UnpackLoMask[] = {0, 0, 1, 1}; + const int UnpackHiMask[] = {2, 2, 3, 3}; + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) + Mask = UnpackLoMask; + else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) + Mask = UnpackHiMask; + + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + } + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Shift; + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) + return V; + + // Try to use byte rotation instructions. + // Its more profitable for pre-SSSE3 to use shuffles/unpacks. + if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + // If we have direct support for blends, we should lower by decomposing into + // a permute. That will be faster than the domain cross. + if (IsBlendSupported) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, + Mask, DAG); + + // Try to lower by permuting the inputs into an unpack instruction. + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, + V2, Mask, DAG)) + return Unpack; + + // We implement this with SHUFPS because it can blend from two vectors. + // Because we're going to eventually use SHUFPS, we use SHUFPS even to build + // up the inputs, bypassing domain shift penalties that we would encur if we + // directly used PSHUFD on Nehalem and older. For newer chips, this isn't + // relevant. + return DAG.getBitcast( + MVT::v4i32, + DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1), + DAG.getBitcast(MVT::v4f32, V2), Mask)); +} + +/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 +/// shuffle lowering, and the most complex part. +/// +/// The lowering strategy is to try to form pairs of input lanes which are +/// targeted at the same half of the final vector, and then use a dword shuffle +/// to place them onto the right half, and finally unpack the paired lanes into +/// their final position. +/// +/// The exact breakdown of how to form these dword pairs and align them on the +/// correct sides is really tricky. See the comments within the function for +/// more of the details. +/// +/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each +/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to +/// this routine for it to work correctly. 
To shuffle a 256-bit or 512-bit i16 +/// vector, form the analogous 128-bit 8-element Mask. +static SDValue lowerV8I16GeneralSingleInputVectorShuffle( + SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); + MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + + assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); + MutableArrayRef<int> LoMask = Mask.slice(0, 4); + MutableArrayRef<int> HiMask = Mask.slice(4, 4); + + SmallVector<int, 4> LoInputs; + std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); + SmallVector<int, 4> HiInputs; + std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 0; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); + int NumLToL = + std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); + int NumHToL = LoInputs.size() - NumLToL; + int NumLToH = + std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); + int NumHToH = HiInputs.size() - NumLToH; + MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); + MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); + MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); + MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); + + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all + // such inputs we can swap two of the dwords across the half mark and end up + // with <=2 inputs to each half in each half. Once there, we can fall through + // to the generic code below. For example: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] + // + // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half + // and an existing 2-into-2 on the other half. In this case we may have to + // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or + // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. + // Fortunately, we don't have to handle anything but a 2-into-2 pattern + // because any other situation (including a 3-into-1 or 1-into-3 in the other + // half than the one we target for fixing) will be fixed when we re-enter this + // path. We will also combine away any sequence of PSHUFD instructions that + // result into a single instruction. Here is an example of the tricky case: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] + // + // This now has a 1-into-3 in the high half! Instead, we do two shuffles: + // + // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] + // + // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] + // + // The result is fine to be handled by the generic logic. 
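The two-step example above can be checked mechanically: a PSHUFD-style dword permutation moves whole pairs of words, so a word-level mask is remapped by sending word w to newSlot(w / 2) * 2 + w % 2. A minimal sketch that reproduces the remapped mask from the first example in the comment:

// Minimal sketch, not part of the patch.
#include <cstdio>
#include <vector>

int main() {
  const int DwordPerm[4] = {0, 2, 1, 3}; // Slot i receives old dword DwordPerm[i].
  int NewSlot[4];
  for (int i = 0; i < 4; ++i)
    NewSlot[DwordPerm[i]] = i;           // Where each old dword ends up.

  std::vector<int> Mask = {0, 1, 2, 7, 4, 5, 6, 3};
  for (int &M : Mask)
    M = NewSlot[M / 2] * 2 + M % 2;

  // Expected remapped mask: [0, 1, 4, 7, 2, 3, 6, 5].
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}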
+ auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, + ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, + int AOffset, int BOffset) { + assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && + "Must call this with A having 3 or 1 inputs from the A half."); + assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && + "Must call this with B having 1 or 3 inputs from the B half."); + assert(AToAInputs.size() + BToAInputs.size() == 4 && + "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); + + bool ThreeAInputs = AToAInputs.size() == 3; + + // Compute the index of dword with only one word among the three inputs in + // a half by taking the sum of the half with three inputs and subtracting + // the sum of the actual three inputs. The difference is the remaining + // slot. + int ADWord, BDWord; + int &TripleDWord = ThreeAInputs ? ADWord : BDWord; + int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; + int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; + ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; + int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; + int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); + int TripleNonInputIdx = + TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); + TripleDWord = TripleNonInputIdx / 2; + + // We use xor with one to compute the adjacent DWord to whichever one the + // OneInput is in. + OneInputDWord = (OneInput / 2) ^ 1; + + // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA + // and BToA inputs. If there is also such a problem with the BToB and AToB + // inputs, we don't try to fix it necessarily -- we'll recurse and see it in + // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it + // is essential that we don't *create* a 3<-1 as then we might oscillate. + if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { + // Compute how many inputs will be flipped by swapping these DWords. We + // need + // to balance this to ensure we don't form a 3-1 shuffle in the other + // half. + int NumFlippedAToBInputs = + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); + int NumFlippedBToBInputs = + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); + if ((NumFlippedAToBInputs == 1 && + (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || + (NumFlippedBToBInputs == 1 && + (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { + // We choose whether to fix the A half or B half based on whether that + // half has zero flipped inputs. At zero, we may not be able to fix it + // with that half. We also bias towards fixing the B half because that + // will more commonly be the high half, and we have to bias one way. + auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, + ArrayRef<int> Inputs) { + int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. + bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), + PinnedIdx ^ 1) != Inputs.end(); + // Determine whether the free index is in the flipped dword or the + // unflipped dword based on where the pinned index is. We use this bit + // in an xor to conditionally select the adjacent dword. 
+ int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); + bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + if (IsFixIdxInput == IsFixFreeIdxInput) + FixFreeIdx += 1; + IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + assert(IsFixIdxInput != IsFixFreeIdxInput && + "We need to be changing the number of flipped inputs!"); + int PSHUFHalfMask[] = {0, 1, 2, 3}; + std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); + V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, + MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); + + for (int &M : Mask) + if (M != -1 && M == FixIdx) + M = FixFreeIdx; + else if (M != -1 && M == FixFreeIdx) + M = FixIdx; + }; + if (NumFlippedBToBInputs != 0) { + int BPinnedIdx = + BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); + } else { + assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); + int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; + FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); + } + } + } + + int PSHUFDMask[] = {0, 1, 2, 3}; + PSHUFDMask[ADWord] = BDWord; + PSHUFDMask[BDWord] = ADWord; + V = DAG.getBitcast( + VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + + // Adjust the mask to match the new locations of A and B. + for (int &M : Mask) + if (M != -1 && M/2 == ADWord) + M = 2 * BDWord + M % 2; + else if (M != -1 && M/2 == BDWord) + M = 2 * ADWord + M % 2; + + // Recurse back into this routine to re-compute state now that this isn't + // a 3 and 1 problem. + return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, + DAG); + }; + if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) + return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); + else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) + return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); + + // At this point there are at most two inputs to the low and high halves from + // each half. That means the inputs can always be grouped into dwords and + // those dwords can then be moved to the correct half with a dword shuffle. + // We use at most one low and one high word shuffle to collect these paired + // inputs into dwords, and finally a dword shuffle to place them. + int PSHUFLMask[4] = {-1, -1, -1, -1}; + int PSHUFHMask[4] = {-1, -1, -1, -1}; + int PSHUFDMask[4] = {-1, -1, -1, -1}; + + // First fix the masks for all the inputs that are staying in their + // original halves. This will then dictate the targets of the cross-half + // shuffles. + auto fixInPlaceInputs = + [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, + MutableArrayRef<int> SourceHalfMask, + MutableArrayRef<int> HalfMask, int HalfOffset) { + if (InPlaceInputs.empty()) + return; + if (InPlaceInputs.size() == 1) { + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; + return; + } + if (IncomingInputs.empty()) { + // Just fix all of the in place inputs. 
+ for (int Input : InPlaceInputs) { + SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; + PSHUFDMask[Input / 2] = Input / 2; + } + return; + } + + assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + // Put the second input next to the first so that they are packed into + // a dword. We find the adjacent index by toggling the low bit. + int AdjIndex = InPlaceInputs[0] ^ 1; + SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; + std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); + PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; + }; + fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); + fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); + + // Now gather the cross-half inputs and place them into a free dword of + // their target half. + // FIXME: This operation could almost certainly be simplified dramatically to + // look more like the 3-1 fixing operation. + auto moveInputsToRightHalf = [&PSHUFDMask]( + MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, + MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, + MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, + int DestOffset) { + auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { + return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; + }; + auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, + int Word) { + int LowWord = Word & ~1; + int HighWord = Word | 1; + return isWordClobbered(SourceHalfMask, LowWord) || + isWordClobbered(SourceHalfMask, HighWord); + }; + + if (IncomingInputs.empty()) + return; + + if (ExistingInputs.empty()) { + // Map any dwords with inputs from them into the right half. + for (int Input : IncomingInputs) { + // If the source half mask maps over the inputs, turn those into + // swaps and use the swapped lane. + if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { + if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { + SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = + Input - SourceOffset; + // We have to swap the uses in our half mask in one sweep. + for (int &M : HalfMask) + if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) + M = Input; + else if (M == Input) + M = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } else { + assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == + Input - SourceOffset && + "Previous placement doesn't match!"); + } + // Note that this correctly re-maps both when we do a swap and when + // we observe the other side of the swap above. We rely on that to + // avoid swapping the members of the input list directly. + Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } + + // Map the input's dword into the correct half. + if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) + PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; + else + assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == + Input / 2 && + "Previous placement doesn't match!"); + } + + // And just directly shift any other-half mask elements to be same-half + // as we will have mirrored the dword containing the element into the + // same position within that half. 
+ for (int &M : HalfMask) + if (M >= SourceOffset && M < SourceOffset + 4) { + M = M - SourceOffset + DestOffset; + assert(M >= 0 && "This should never wrap below zero!"); + } + return; + } + + // Ensure we have the input in a viable dword of its current half. This + // is particularly tricky because the original position may be clobbered + // by inputs being moved and *staying* in that half. + if (IncomingInputs.size() == 1) { + if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int InputFixed = std::find(std::begin(SourceHalfMask), + std::end(SourceHalfMask), -1) - + std::begin(SourceHalfMask) + SourceOffset; + SourceHalfMask[InputFixed - SourceOffset] = + IncomingInputs[0] - SourceOffset; + std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], + InputFixed); + IncomingInputs[0] = InputFixed; + } + } else if (IncomingInputs.size() == 2) { + if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || + isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + // We have two non-adjacent or clobbered inputs we need to extract from + // the source half. To do this, we need to map them into some adjacent + // dword slot in the source mask. + int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, + IncomingInputs[1] - SourceOffset}; + + // If there is a free slot in the source half mask adjacent to one of + // the inputs, place the other input in it. We use (Index XOR 1) to + // compute an adjacent index. + if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && + SourceHalfMask[InputsFixed[0] ^ 1] == -1) { + SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + InputsFixed[1] = InputsFixed[0] ^ 1; + } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && + SourceHalfMask[InputsFixed[1] ^ 1] == -1) { + SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; + InputsFixed[0] = InputsFixed[1] ^ 1; + } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { + // The two inputs are in the same DWord but it is clobbered and the + // adjacent DWord isn't used at all. Move both inputs to the free + // slot. + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; + InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); + InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; + } else { + // The only way we hit this point is if there is no clobbering + // (because there are no off-half inputs to this half) and there is no + // free slot adjacent to one of the inputs. In this case, we have to + // swap an input with a non-input. + for (int i = 0; i < 4; ++i) + assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && + "We can't handle any clobbers here!"); + assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && + "Cannot have adjacent inputs here!"); + + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; + + // We also have to update the final source mask in this case because + // it may need to undo the above swap. + for (int &M : FinalSourceHalfMask) + if (M == (InputsFixed[0] ^ 1) + SourceOffset) + M = InputsFixed[1] + SourceOffset; + else if (M == InputsFixed[1] + SourceOffset) + M = (InputsFixed[0] ^ 1) + SourceOffset; + + InputsFixed[1] = InputsFixed[0] ^ 1; + } + + // Point everything at the fixed inputs. 
+ for (int &M : HalfMask) + if (M == IncomingInputs[0]) + M = InputsFixed[0] + SourceOffset; + else if (M == IncomingInputs[1]) + M = InputsFixed[1] + SourceOffset; + + IncomingInputs[0] = InputsFixed[0] + SourceOffset; + IncomingInputs[1] = InputsFixed[1] + SourceOffset; + } + } else { + llvm_unreachable("Unhandled input size!"); + } + + // Now hoist the DWord down to the right half. + int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; + assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); + PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; + for (int &M : HalfMask) + for (int Input : IncomingInputs) + if (M == Input) + M = FreeDWord * 2 + Input % 2; + }; + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, + /*SourceOffset*/ 4, /*DestOffset*/ 0); + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, + /*SourceOffset*/ 0, /*DestOffset*/ 4); + + // Now enact all the shuffles we've computed to move the inputs into their + // target half. + if (!isNoopShuffleMask(PSHUFLMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, + getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); + if (!isNoopShuffleMask(PSHUFHMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, + getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); + if (!isNoopShuffleMask(PSHUFDMask)) + V = DAG.getBitcast( + VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + + // At this point, each half should contain all its inputs, and we can then + // just shuffle them into their final position. + assert(std::count_if(LoMask.begin(), LoMask.end(), + [](int M) { return M >= 4; }) == 0 && + "Failed to lift all the high half inputs to the low mask!"); + assert(std::count_if(HiMask.begin(), HiMask.end(), + [](int M) { return M >= 0 && M < 4; }) == 0 && + "Failed to lift all the low half inputs to the high mask!"); + + // Do a half shuffle for the low mask. + if (!isNoopShuffleMask(LoMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, + getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); + + // Do a half shuffle with the high mask after shifting its values down. + for (int &M : HiMask) + if (M >= 0) + M -= 4; + if (!isNoopShuffleMask(HiMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, + getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); + + return V; +} + +/// \brief Helper to form a PSHUFB-based shuffle+blend. +static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG, bool &V1InUse, + bool &V2InUse) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V1Mask[16]; + SDValue V2Mask[16]; + V1InUse = false; + V2InUse = false; + + int Size = Mask.size(); + int Scale = 16 / Size; + for (int i = 0; i < 16; ++i) { + if (Mask[i / Scale] == -1) { + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); + } else { + const int ZeroMask = 0x80; + int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale + : ZeroMask; + int V2Idx = Mask[i / Scale] < Size + ? 
ZeroMask + : (Mask[i / Scale] - Size) * Scale + i % Scale; + if (Zeroable[i / Scale]) + V1Idx = V2Idx = ZeroMask; + V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); + V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); + V1InUse |= (ZeroMask != V1Idx); + V2InUse |= (ZeroMask != V2Idx); + } + } + + if (V1InUse) + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getBitcast(MVT::v16i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + if (V2InUse) + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getBitcast(MVT::v16i8, V2), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + + // If we need shuffled inputs from both, blend the two. + SDValue V; + if (V1InUse && V2InUse) + V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + else + V = V1InUse ? V1 : V2; + + // Cast the result back to the correct type. + return DAG.getBitcast(VT, V); +} + +/// \brief Generic lowering of 8-lane i16 shuffles. +/// +/// This handles both single-input shuffles and combined shuffle/blends with +/// two inputs. The single input shuffles are immediately delegated to +/// a dedicated lowering routine. +/// +/// The blends are lowered in one of three fundamental ways. If there are few +/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle +/// of the input is significantly cheaper when lowered as an interleaving of +/// the two inputs, try to interleave them. Otherwise, blend the low and high +/// halves of the inputs separately (making them have relatively few inputs) +/// and then concatenate them. +static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> OrigMask = SVOp->getMask(); + int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; + MutableArrayRef<int> Mask(MaskStorage); + + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG)) + return ZExt; + + auto isV1 = [](int M) { return M >= 0 && M < 8; }; + (void)isV1; + auto isV2 = [](int M) { return M >= 8; }; + + int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); + + if (NumV2Inputs == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; + + // Try to use byte rotation instructions. 
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, + Mask, Subtarget, DAG)) + return Rotate; + + return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask, + Subtarget, DAG); + } + + assert(std::any_of(Mask.begin(), Mask.end(), isV1) && + "All single-input shuffles should be canonicalized to be V1-input " + "shuffles."); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Shift; + + // See if we can use SSE4A Extraction / Insertion. + if (Subtarget->hasSSE4A()) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return V; + + // There are special ways we can lower some single-element blends. + if (NumV2Inputs == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + if (SDValue BitBlend = + lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return BitBlend; + + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, + V2, Mask, DAG)) + return Unpack; + + // If we can't directly blend but can use PSHUFB, that will be better as it + // can both shuffle and set up the inefficient blend. + if (!IsBlendSupported && Subtarget->hasSSSE3()) { + bool V1InUse, V2InUse; + return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG, + V1InUse, V2InUse); + } + + // We can always bit-blend if we have to so the fallback strategy is to + // decompose into single-input permutes and blends. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + Mask, DAG); +} + +/// \brief Check whether a compaction lowering can be done by dropping even +/// elements and compute how many times even elements must be dropped. +/// +/// This handles shuffles which take every Nth element where N is a power of +/// two. Example shuffle masks: +/// +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 +/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 +/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 +/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 +/// +/// Any of these lanes can of course be undef. +/// +/// This routine only supports N <= 3. +/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here +/// for larger N. +/// +/// \returns N above, or the number of times even elements must be dropped if +/// there is such a number. Otherwise returns zero. +static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { + // Figure out whether we're looping over two inputs or just one. 
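// --- Editor's worked example (not part of this patch) ----------------------
// For a two-input v16i8 shuffle the modulus is M = 32, so ViableForN[1]
// (N = 2) requires Mask[i] == (i << 2) & 31 for every defined lane:
//   i = 0 .. 7  ->  0, 4, 8, ..., 28
//   i = 8       ->  (8 << 2) & 31 = 0, wrapping back to the start,
// which is exactly the two-input "N = 2" pattern listed in the comment above.
// Undef lanes are skipped, so a partially-undef mask can remain viable for
// several values of N at once; the smallest surviving N is what gets returned.
// --- End editor's note ------------------------------------------------------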
+ bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // The modulus for the shuffle vector entries is based on whether this is + // a single input or not. + int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); + assert(isPowerOf2_32((uint32_t)ShuffleModulus) && + "We should only be called with masks with a power-of-2 size!"); + + uint64_t ModMask = (uint64_t)ShuffleModulus - 1; + + // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, + // and 2^3 simultaneously. This is because we may have ambiguity with + // partially undef inputs. + bool ViableForN[3] = {true, true, true}; + + for (int i = 0, e = Mask.size(); i < e; ++i) { + // Ignore undef lanes, we'll optimistically collapse them to the pattern we + // want. + if (Mask[i] == -1) + continue; + + bool IsAnyViable = false; + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) { + uint64_t N = j + 1; + + // The shuffle mask must be equal to (i * 2^N) % M. + if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) + IsAnyViable = true; + else + ViableForN[j] = false; + } + // Early exit if we exhaust the possible powers of two. + if (!IsAnyViable) + break; + } + + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) + return j + 1; + + // Return 0 as there is no viable power of two. + return 0; +} + +/// \brief Generic lowering of v16i8 shuffles. +/// +/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to +/// detect any complexity reducing interleaving. If that doesn't help, it uses +/// UNPCK to spread the i8 elements across two i16-element vectors, and uses +/// the existing lowering for v8i16 blends on each half, finally PACK-ing them +/// back together. +static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + // Try to use a zext lowering. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + return ZExt; + + // See if we can use SSE4A Extraction / Insertion. + if (Subtarget->hasSSE4A()) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return V; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); + + // For single-input shuffles, there are some nicer lowering tricks we can use. + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Check whether we can widen this to an i16 shuffle by duplicating bytes. + // Notably, this handles splat and partial-splat shuffles more efficiently. 
+ // However, it only makes sense if the pre-duplication shuffle simplifies + // things significantly. Currently, this means we need to be able to + // express the pre-duplication shuffle as an i16 shuffle. + // + // FIXME: We should check for other patterns which can be widened into an + // i16 shuffle as well. + auto canWidenViaDuplication = [](ArrayRef<int> Mask) { + for (int i = 0; i < 16; i += 2) + if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) + return false; + + return true; + }; + auto tryToWidenViaDuplication = [&]() -> SDValue { + if (!canWidenViaDuplication(Mask)) + return SDValue(); + SmallVector<int, 4> LoInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0 && M < 8; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), + LoInputs.end()); + SmallVector<int, 4> HiInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 8; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), + HiInputs.end()); + + bool TargetLo = LoInputs.size() >= HiInputs.size(); + ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs; + ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs; + + int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; + SmallDenseMap<int, int, 8> LaneMap; + for (int I : InPlaceInputs) { + PreDupI16Shuffle[I/2] = I/2; + LaneMap[I] = I; + } + int j = TargetLo ? 0 : 4, je = j + 4; + for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { + // Check if j is already a shuffle of this input. This happens when + // there are two adjacent bytes after we move the low one. + if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { + // If we haven't yet mapped the input, search for a slot into which + // we can map it. + while (j < je && PreDupI16Shuffle[j] != -1) + ++j; + + if (j == je) + // We can't place the inputs into a single half with a simple i16 shuffle, so bail. + return SDValue(); + + // Map this input with the i16 shuffle. + PreDupI16Shuffle[j] = MovingInputs[i] / 2; + } + + // Update the lane map based on the mapping we ended up with. + LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; + } + V1 = DAG.getBitcast( + MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); + + // Unpack the bytes to form the i16s that will be shuffled into place. + V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, V1, V1); + + int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; ++i) + if (Mask[i] != -1) { + int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); + if (PostDupI16Shuffle[i / 2] == -1) + PostDupI16Shuffle[i / 2] = MappedMask; + else + assert(PostDupI16Shuffle[i / 2] == MappedMask && + "Conflicting entrties in the original shuffle!"); + } + return DAG.getBitcast( + MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); + }; + if (SDValue V = tryToWidenViaDuplication()) + return V; + } + + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. 
+ if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; + + // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly + // with PSHUFB. It is important to do this before we attempt to generate any + // blends but after all of the single-input lowerings. If the single input + // lowerings can find an instruction sequence that is faster than a PSHUFB, we + // want to preserve that and we can DAG combine any longer sequences into + // a PSHUFB in the end. But once we start blending from multiple inputs, + // the complexity of DAG combining bad patterns back into PSHUFB is too high, + // and there are *very* few patterns that would actually be faster than the + // PSHUFB approach because of its ability to zero lanes. + // + // FIXME: The only exceptions to the above are blends which are exact + // interleavings with direct instructions supporting them. We currently don't + // handle those well here. + if (Subtarget->hasSSSE3()) { + bool V1InUse = false; + bool V2InUse = false; + + SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask, + DAG, V1InUse, V2InUse); + + // If both V1 and V2 are in use and we can use a direct blend or an unpack, + // do so. This avoids using them to handle blends-with-zero which is + // important as a single pshufb is significantly faster for that. + if (V1InUse && V2InUse) { + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) + return Blend; + + // We can use an unpack to do the blending rather than an or in some + // cases. Even though the or may be (very minorly) more efficient, we + // preference this lowering because there are common cases where part of + // the complexity of the shuffles goes away when we do the final blend as + // an unpack. + // FIXME: It might be worth trying to detect if the unpack-feeding + // shuffles will both be pshufb, in which case we shouldn't bother with + // this. + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Unpack; + } + + return PSHUFB; + } + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) + return V; + + if (SDValue BitBlend = + lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return BitBlend; + + // Check whether a compaction lowering can be done. This handles shuffles + // which take every Nth element for some even N. See the helper function for + // details. + // + // We special case these as they can be particularly efficiently handled with + // the PACKUSB instruction on x86 and they show up in common patterns of + // rearranging bytes to truncate wide elements. + if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { + // NumEvenDrops is the power of two stride of the elements. Another way of + // thinking about it is that we need to drop the even elements this many + // times to get the original input. + bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // First we need to zero all the dropped bytes. + assert(NumEvenDrops <= 3 && + "No support for dropping even elements more than 3 times."); + // We use the mask type to pick which bytes are preserved based on how many + // elements are dropped. 
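// --- Editor's worked example (not part of this patch) ----------------------
// For NumEvenDrops == 2 the clear mask is built as v4i32 0xFF, so only byte 0
// of every dword survives the AND. Packing then proceeds in rounds:
//   after AND : x0 0 0 0 x4 0 0 0 x8 0 0 0 ...   (x = original byte)
//   PACKUSWB  : each i16 lane holds 0x00xx, which saturates to xx, halving
//               the element spacing,
// and the second PACKUSWB round compacts the remaining lanes, leaving the
// bytes originally at indices 0, 4, 8, ... in the low lanes -- the
// "take every 4th element" mask this path was selected for.
// --- End editor's note ------------------------------------------------------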
+ MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; + SDValue ByteClearMask = DAG.getBitcast( + MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); + V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); + if (!IsSingleInput) + V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); + + // Now pack things back together. + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); + SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); + for (int i = 1; i < NumEvenDrops; ++i) { + Result = DAG.getBitcast(MVT::v8i16, Result); + Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); + } + + return Result; + } + + // Handle multi-input cases by blending single-input shuffles. + if (NumV2Elements > 0) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, + Mask, DAG); + + // The fallback path for single-input shuffles widens this into two v8i16 + // vectors with unpacks, shuffles those, and then pulls them back together + // with a pack. + SDValue V = V1; + + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; ++i) + if (Mask[i] >= 0) + (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; + + SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); + + SDValue VLoHalf, VHiHalf; + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask + // them out and avoid using UNPCK{L,H} to extract the elements of V as + // i16s. + if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask), + [](int M) { return M >= 0 && M % 2 == 1; }) && + std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask), + [](int M) { return M >= 0 && M % 2 == 1; })) { + // Use a mask to drop the high bytes. + VLoHalf = DAG.getBitcast(MVT::v8i16, V); + VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, + DAG.getConstant(0x00FF, DL, MVT::v8i16)); + + // This will be a single vector shuffle instead of a blend so nuke VHiHalf. + VHiHalf = DAG.getUNDEF(MVT::v8i16); + + // Squash the masks to point directly into VLoHalf. + for (int &M : LoBlendMask) + if (M >= 0) + M /= 2; + for (int &M : HiBlendMask) + if (M >= 0) + M /= 2; + } else { + // Otherwise just unpack the low half of V into VLoHalf and the high half into + // VHiHalf so that we can blend them as i16s. + VLoHalf = DAG.getBitcast( + MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); + VHiHalf = DAG.getBitcast( + MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); + } + + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); + + return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); +} + +/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. +/// +/// This routine breaks down the specific type of 128-bit shuffle and +/// dispatches to the lowering routines accordingly. 
+static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + switch (VT.SimpleTy) { + case MVT::v2i64: + return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v2f64: + return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i32: + return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4f32: + return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i16: + return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i8: + return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Unimplemented!"); + } +} + +/// \brief Helper function to test whether a shuffle mask could be +/// simplified by widening the elements being shuffled. +/// +/// Appends the mask for wider elements in WidenedMask if valid. Otherwise +/// leaves it in an unspecified state. +/// +/// NOTE: This must handle normal vector shuffle masks and *target* vector +/// shuffle masks. The latter have the special property of a '-2' representing +/// a zero-ed lane of a vector. +static bool canWidenShuffleElements(ArrayRef<int> Mask, + SmallVectorImpl<int> &WidenedMask) { + for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + // If both elements are undef, its trivial. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + WidenedMask.push_back(SM_SentinelUndef); + continue; + } + + // Check for an undef mask and a mask value properly aligned to fit with + // a pair of values. If we find such a case, use the non-undef mask's value. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { + WidenedMask.push_back(Mask[i + 1] / 2); + continue; + } + if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // When zeroing, we need to spread the zeroing across both lanes to widen. + if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { + if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && + (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + WidenedMask.push_back(SM_SentinelZero); + continue; + } + return false; + } + + // Finally check if the two mask values are adjacent and aligned with + // a pair. + if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // Otherwise we can't safely widen the elements used in this shuffle. + return false; + } + assert(WidenedMask.size() == Mask.size() / 2 && + "Incorrect size of mask after widening the elements!"); + + return true; +} + +/// \brief Generic routine to split vector shuffle into half-sized shuffles. +/// +/// This routine just extracts two subvectors, shuffles them independently, and +/// then concatenates them back together. This should work effectively with all +/// AVX vector shuffle types. 
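// --- Editor's note (illustration, not part of this patch) -----------------
// A standalone sketch of the widening rule implemented by
// canWidenShuffleElements above, reduced to plain integers so the pairing
// logic can be exercised in isolation. The sentinels mirror SM_SentinelUndef
// (-1) and SM_SentinelZero (-2); the helper name widenMask is hypothetical.
#include <cassert>
#include <cstddef>
#include <vector>

namespace {
constexpr int Undef = -1;
constexpr int Zero = -2;

bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
  for (std::size_t i = 0; i < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo == Undef && Hi == Undef) { Widened.push_back(Undef); continue; }
    if (Lo == Undef && Hi >= 0 && Hi % 2 == 1) { Widened.push_back(Hi / 2); continue; }
    if (Hi == Undef && Lo >= 0 && Lo % 2 == 0) { Widened.push_back(Lo / 2); continue; }
    if (Lo == Zero || Hi == Zero) {
      if ((Lo == Zero || Lo == Undef) && (Hi == Zero || Hi == Undef)) {
        Widened.push_back(Zero);
        continue;
      }
      return false; // A zero paired with a real element cannot be widened.
    }
    if (Lo >= 0 && Lo % 2 == 0 && Hi == Lo + 1) { Widened.push_back(Lo / 2); continue; }
    return false; // The pair does not form one aligned wide element.
  }
  return true;
}
} // namespace

int main() {
  std::vector<int> W;
  // A v4i64 mask <0, 1, 6, 7> widens to the v2 (128-bit element) mask <0, 3>.
  assert(widenMask({0, 1, 6, 7}, W) && (W == std::vector<int>{0, 3}));
  // <1, 0, 2, 3> cannot widen: the first pair straddles two wide elements.
  W.clear();
  assert(!widenMask({1, 0, 2, 3}, W));
  return 0;
}
// --- End editor's note -----------------------------------------------------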
+static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() >= 256 && + "Only for 256-bit or wider vector shuffles!"); + assert(V1.getSimpleValueType() == VT && "Bad operand type!"); + assert(V2.getSimpleValueType() == VT && "Bad operand type!"); + + ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); + ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2); + + int NumElements = VT.getVectorNumElements(); + int SplitNumElements = NumElements / 2; + MVT ScalarVT = VT.getVectorElementType(); + MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); + + // Rather than splitting build-vectors, just build two narrower build + // vectors. This helps shuffling with splats and zeros. + auto SplitVector = [&](SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V->getOperand(0); + + MVT OrigVT = V.getSimpleValueType(); + int OrigNumElements = OrigVT.getVectorNumElements(); + int OrigSplitNumElements = OrigNumElements / 2; + MVT OrigScalarVT = OrigVT.getVectorElementType(); + MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); + + SDValue LoV, HiV; + + auto *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV) { + LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, + DAG.getIntPtrConstant(0, DL)); + HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, + DAG.getIntPtrConstant(OrigSplitNumElements, DL)); + } else { + + SmallVector<SDValue, 16> LoOps, HiOps; + for (int i = 0; i < OrigSplitNumElements; ++i) { + LoOps.push_back(BV->getOperand(i)); + HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); + } + LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps); + HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps); + } + return std::make_pair(DAG.getBitcast(SplitVT, LoV), + DAG.getBitcast(SplitVT, HiV)); + }; + + SDValue LoV1, HiV1, LoV2, HiV2; + std::tie(LoV1, HiV1) = SplitVector(V1); + std::tie(LoV2, HiV2) = SplitVector(V2); + + // Now create two 4-way blends of these half-width vectors. + auto HalfBlend = [&](ArrayRef<int> HalfMask) { + bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; + SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask; + for (int i = 0; i < SplitNumElements; ++i) { + int M = HalfMask[i]; + if (M >= NumElements) { + if (M >= NumElements + SplitNumElements) + UseHiV2 = true; + else + UseLoV2 = true; + V2BlendMask.push_back(M - NumElements); + V1BlendMask.push_back(-1); + BlendMask.push_back(SplitNumElements + i); + } else if (M >= 0) { + if (M >= SplitNumElements) + UseHiV1 = true; + else + UseLoV1 = true; + V2BlendMask.push_back(-1); + V1BlendMask.push_back(M); + BlendMask.push_back(i); + } else { + V2BlendMask.push_back(-1); + V1BlendMask.push_back(-1); + BlendMask.push_back(-1); + } + } + + // Because the lowering happens after all combining takes place, we need to + // manually combine these blend masks as much as possible so that we create + // a minimal number of high-level vector shuffle nodes. + + // First try just blending the halves of V1 or V2. 
+ if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) + return DAG.getUNDEF(SplitVT); + if (!UseLoV2 && !UseHiV2) + return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + if (!UseLoV1 && !UseHiV1) + return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + + SDValue V1Blend, V2Blend; + if (UseLoV1 && UseHiV1) { + V1Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + } else { + // We only use half of V1 so map the usage down into the final blend mask. + V1Blend = UseLoV1 ? LoV1 : HiV1; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) + BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); + } + if (UseLoV2 && UseHiV2) { + V2Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + } else { + // We only use half of V2 so map the usage down into the final blend mask. + V2Blend = UseLoV2 ? LoV2 : HiV2; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= SplitNumElements) + BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); + } + return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); + }; + SDValue Lo = HalfBlend(LoMask); + SDValue Hi = HalfBlend(HiMask); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); +} + +/// \brief Either split a vector in halves or decompose the shuffles and the +/// blend. +/// +/// This is provided as a good fallback for many lowerings of non-single-input +/// shuffles with more than one 128-bit lane. In those cases, we want to select +/// between splitting the shuffle into 128-bit components and stitching those +/// back together vs. extracting the single-input shuffles and blending those +/// results. +static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to " + "lower single-input shuffles as it " + "could then recurse on itself."); + int Size = Mask.size(); + + // If this can be modeled as a broadcast of two elements followed by a blend, + // prefer that lowering. This is especially important because broadcasts can + // often fold with memory operands. + auto DoBothBroadcast = [&] { + int V1BroadcastIdx = -1, V2BroadcastIdx = -1; + for (int M : Mask) + if (M >= Size) { + if (V2BroadcastIdx == -1) + V2BroadcastIdx = M - Size; + else if (M - Size != V2BroadcastIdx) + return false; + } else if (M >= 0) { + if (V1BroadcastIdx == -1) + V1BroadcastIdx = M; + else if (M != V1BroadcastIdx) + return false; + } + return true; + }; + if (DoBothBroadcast()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, + DAG); + + // If the inputs all stem from a single 128-bit lane of each input, then we + // split them rather than blending because the split will decompose to + // unusually few instructions. + int LaneCount = VT.getSizeInBits() / 128; + int LaneSize = Size / LaneCount; + SmallBitVector LaneInputs[2]; + LaneInputs[0].resize(LaneCount, false); + LaneInputs[1].resize(LaneCount, false); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; + if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + // Otherwise, just fall back to decomposed shuffles and a blend. This requires + // that the decomposed single-input shuffles don't end up here. 
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as +/// a permutation and blend of those lanes. +/// +/// This essentially blends the out-of-lane inputs to each lane into the lane +/// from a permuted copy of the vector. This lowering strategy results in four +/// instructions in the worst case for a single-input cross lane shuffle which +/// is lower than any other fully general cross-lane shuffle strategy I'm aware +/// of. Special cases for each particular shuffle pattern should be handled +/// prior to trying this lowering. +static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // FIXME: This should probably be generalized for 512-bit vectors as well. + assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); + int LaneSize = Mask.size() / 2; + + // If there are only inputs from one 128-bit lane, splitting will in fact be + // less expensive. The flags track whether the given lane contains an element + // that crosses to another lane. + bool LaneCrossing[2] = {false, false}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + LaneCrossing[(Mask[i] % Size) / LaneSize] = true; + if (!LaneCrossing[0] || !LaneCrossing[1]) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + if (isSingleInputShuffleMask(Mask)) { + SmallVector<int, 32> FlippedBlendMask; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + FlippedBlendMask.push_back( + Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) + ? Mask[i] + : Mask[i] % LaneSize + + (i / LaneSize) * LaneSize + Size)); + + // Flip the vector, and blend the results which should now be in-lane. The + // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and + // 5 for the high source. The value 3 selects the high half of source 2 and + // the value 2 selects the low half of source 2. We only use source 2 to + // allow folding it into a memory operand. + unsigned PERMMask = 3 | 2 << 4; + SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), + V1, DAG.getConstant(PERMMask, DL, MVT::i8)); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); + } + + // This now reduces to two single-input shuffles of V1 and V2 which at worst + // will be handled by the above logic and a blend of the results, much like + // other patterns in AVX. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering 2-lane 128-bit shuffles. +static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + // TODO: If minimizing size and one of the inputs is a zero vector and the + // the zero vector has only one use, we could use a VPERM2X128 to save the + // instruction bytes needed to explicitly generate the zero vector. + + // Blends are faster and handle all the non-lane-crossing cases. + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // If either input operand is a zero vector, use VPERM2X128 because its mask + // allows us to replace the zero input with an implicit zero. 
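// --- Editor's worked example (not part of this patch) ----------------------
// Immediate construction below, on a v4f64 mask <2, 3, 4, 5> with V2 known to
// be all zeros: MaskLO = 2 and MaskHI = 4 give
//   PermMask = 2/2 | (4/2) << 4 = 0x21
// (low half <- V1's high lane, high half <- V2's low lane). Because V2 is the
// zero vector and MaskHI >= 4, the high nibble is replaced by the zero bit:
//   PermMask = (0x21 & 0x0f) | 0x80 = 0x81
// and V2 is dropped to undef, so the zeroed half comes for free from
// VPERM2X128 itself.
// --- End editor's note ------------------------------------------------------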
+ if (!IsV1Zero && !IsV2Zero) { + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); + if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0, DL)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + OnlyUsesV1 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + } + + // Otherwise form a 128-bit permutation. After accounting for undefs, + // convert the 64-bit shuffle mask selection values into 128-bit + // selection bits by dividing the indexes by 2 and shifting into positions + // defined by a vperm2*128 instruction's immediate control byte. + + // The immediate permute control byte looks like this: + // [1:0] - select 128 bits from sources for low half of destination + // [2] - ignore + // [3] - zero low half of destination + // [5:4] - select 128 bits from sources for high half of destination + // [6] - ignore + // [7] - zero high half of destination + + int MaskLO = Mask[0]; + if (MaskLO == SM_SentinelUndef) + MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; + + int MaskHI = Mask[2]; + if (MaskHI == SM_SentinelUndef) + MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; + + unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; + + // If either input is a zero vector, replace it with an undef input. + // Shuffle mask values < 4 are selecting elements of V1. + // Shuffle mask values >= 4 are selecting elements of V2. + // Adjust each half of the permute mask by clearing the half that was + // selecting the zero vector and setting the zero mask bit. + if (IsV1Zero) { + V1 = DAG.getUNDEF(VT); + if (MaskLO < 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI < 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + if (IsV2Zero) { + V2 = DAG.getUNDEF(VT); + if (MaskLO >= 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI >= 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + +/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then +/// shuffling each lane. +/// +/// This will only succeed when the result of fixing the 128-bit lanes results +/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in +/// each 128-bit lanes. This handles many cases where we can quickly blend away +/// the lane crosses early and then use simpler shuffles within each lane. +/// +/// FIXME: It might be worthwhile at some point to support this without +/// requiring the 128-bit lane-relative shuffles to be repeating, but currently +/// in x86 only floating point has interesting non-repeating shuffles, and even +/// those are still *marginally* more expensive. +static SDValue lowerVectorShuffleByMerging128BitLanes( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && + "This is only useful with multiple inputs."); + + int Size = Mask.size(); + int LaneSize = 128 / VT.getScalarSizeInBits(); + int NumLanes = Size / LaneSize; + assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); + + // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. 
Also + // check whether the in-128-bit lane shuffles share a repeating pattern. + SmallVector<int, 4> Lanes; + Lanes.resize(NumLanes, -1); + SmallVector<int, 4> InLaneMask; + InLaneMask.resize(LaneSize, -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + int j = i / LaneSize; + + if (Lanes[j] < 0) { + // First entry we've seen for this lane. + Lanes[j] = Mask[i] / LaneSize; + } else if (Lanes[j] != Mask[i] / LaneSize) { + // This doesn't match the lane selected previously! + return SDValue(); + } + + // Check that within each lane we have a consistent shuffle mask. + int k = i % LaneSize; + if (InLaneMask[k] < 0) { + InLaneMask[k] = Mask[i] % LaneSize; + } else if (InLaneMask[k] != Mask[i] % LaneSize) { + // This doesn't fit a repeating in-lane mask. + return SDValue(); + } + } + + // First shuffle the lanes into place. + MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, + VT.getSizeInBits() / 64); + SmallVector<int, 8> LaneMask; + LaneMask.resize(NumLanes * 2, -1); + for (int i = 0; i < NumLanes; ++i) + if (Lanes[i] >= 0) { + LaneMask[2 * i + 0] = 2*Lanes[i] + 0; + LaneMask[2 * i + 1] = 2*Lanes[i] + 1; + } + + V1 = DAG.getBitcast(LaneVT, V1); + V2 = DAG.getBitcast(LaneVT, V2); + SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); + + // Cast it back to the type we actually want. + LaneShuffle = DAG.getBitcast(VT, LaneShuffle); + + // Now do a simple shuffle that isn't lane crossing. + SmallVector<int, 8> NewMask; + NewMask.resize(Size, -1); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; + assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && + "Must not introduce lane crosses at this point!"); + + return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); +} + +/// Lower shuffles where an entire half of a 256-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. +static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + + bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); + bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); + if (!UndefLower && !UndefUpper) + return SDValue(); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (UndefUpper && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. 
vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. + if (UndefLower && Subtarget->hasAVX2() && + (VT == MVT::v4f64 || VT == MVT::v4i64)) + return SDValue(); + + // If the shuffle only uses the lower halves of the input operands, + // then extract them and perform the 'half' shuffle at half width. + // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> + int HalfIdx1 = -1, HalfIdx2 = -1; + SmallVector<int, 8> HalfMask; + unsigned Offset = UndefLower ? HalfNumElts : 0; + for (unsigned i = 0; i != HalfNumElts; ++i) { + int M = Mask[i + Offset]; + if (M < 0) { + HalfMask.push_back(M); + continue; + } + + // Determine which of the 4 half vectors this element is from. + // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. + int HalfIdx = M / HalfNumElts; + + // Only shuffle using the lower halves of the inputs. + // TODO: Investigate usefulness of shuffling with upper halves. + if (HalfIdx != 0 && HalfIdx != 2) + return SDValue(); + + // Determine the element index into its half vector source. + int HalfElt = M % HalfNumElts; + + // We can shuffle with up to 2 half vectors, set the new 'half' + // shuffle mask accordingly. + if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) { + HalfMask.push_back(HalfElt); + HalfIdx1 = HalfIdx; + continue; + } + if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) { + HalfMask.push_back(HalfElt + HalfNumElts); + HalfIdx2 = HalfIdx; + continue; + } + + // Too many half vectors referenced. + return SDValue(); + } + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + auto GetHalfVector = [&](int HalfIdx) { + if (HalfIdx < 0) + return DAG.getUNDEF(HalfVT); + SDValue V = (HalfIdx < 2 ? V1 : V2); + HalfIdx = (HalfIdx % 2) * HalfNumElts; + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, + DAG.getIntPtrConstant(HalfIdx, DL)); + }; + + SDValue Half1 = GetHalfVector(HalfIdx1); + SDValue Half2 = GetHalfVector(HalfIdx2); + SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, + DAG.getIntPtrConstant(Offset, DL)); +} + +/// \brief Test whether the specified input (0 or 1) is in-place blended by the +/// given mask. +/// +/// This returns true if the elements from a particular input are already in the +/// slot required by the given mask and require no permutation. +static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { + assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) + return false; + + return true; +} + +static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. + // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. 
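// --- Editor's worked example (not part of this patch) ----------------------
// For v4f64 the per-lane targets computed below are Val = (i & 6) + 4*(i & 1):
//   i = 0 -> {0,1},  i = 1 -> {4,5},  i = 2 -> {2,3},  i = 3 -> {6,7}
// so the mask <1, 5, 2, 7> is a direct SHUFPD pattern. Each immediate bit is
// Mask[i] % 2, giving Imm = 1 | 1<<1 | 0<<2 | 1<<3 = 0xB for this example.
// The commutable variant checks the same property with the V1/V2 roles
// swapped and emits SHUFP with the operands reversed.
// --- End editor's note ------------------------------------------------------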
+ assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD"); + int NumElts = VT.getVectorNumElements(); + bool ShufpdMask = true; + bool CommutableMask = true; + unsigned Immediate = 0; + for (int i = 0; i < NumElts; ++i) { + if (Mask[i] < 0) + continue; + int Val = (i & 6) + NumElts * (i & 1); + int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1); + if (Mask[i] < Val || Mask[i] > Val + 1) + ShufpdMask = false; + if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) + CommutableMask = false; + Immediate |= (Mask[i] % 2) << i; + } + if (ShufpdMask) + return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, + DAG.getConstant(Immediate, DL, MVT::i8)); + if (CommutableMask) + return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, + DAG.getConstant(Immediate, DL, MVT::i8)); + return SDValue(); +} + +/// \brief Handle lowering of 4-lane 64-bit floating point shuffles. +/// +/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, + DAG); + + if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Use low duplicate instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); + + if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { + // Non-half-crossing single input shuffles can be lowerid with an + // interleaved permutation. + unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, + DAG.getConstant(VPERMILPMask, DL, MVT::i8)); + } + + // With AVX2 we have direct support for this permutation. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, + DAG); + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return V; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check if the blend happens to exactly fit that of SHUFPD. + if (SDValue Op = + lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return Op; + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. 
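+  // For example, with the v4f64 mask <0, 7, 2, 6> input 0 is already in
+  // place, and with AVX2 the cross-lane movement of V2's elements can be done
+  // by the decomposed shuffle/blend below, so the 128-bit lane merge is
+  // skipped.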
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // If we have AVX2 then we always want to lower with a blend because an v4 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 4-lane 64-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v4i64 shuffling.. +static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, + DAG); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // When the shuffle is mirrored between the 128-bit lanes of the unit, we can + // use lower latency instructions that will operate on both 128-bit lanes. + SmallVector<int, 2> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { + if (isSingleInputShuffleMask(Mask)) { + int PSHUFDMask[] = {-1, -1, -1, -1}; + for (int i = 0; i < 2; ++i) + if (RepeatedMask[i] >= 0) { + PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; + PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; + } + return DAG.getBitcast( + MVT::v4i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, + DAG.getBitcast(MVT::v8i32, V1), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + } + } + + // AVX2 provides a direct instruction for permuting a single input across + // lanes. + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + return V; + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. 
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit floating point shuffles. +/// +/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane, we have many more + // options to efficiently lower the shuffle. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && + "Repeated masks must be half the mask width!"); + + // Use even/odd duplicate instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); + if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7})) + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); + + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + return V; + + // Otherwise, fall back to a SHUFPS sequence. Here it is important that we + // have already handled any direct blends. We also need to squash the + // repeated mask into a simulated v4f32 mask. + for (int i = 0; i < 4; ++i) + if (RepeatedMask[i] >= 8) + RepeatedMask[i] -= 4; + return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); + } + + // If we have a single input shuffle with different shuffle patterns in the + // two 128-bit lanes use the variable mask to VPERMILPS. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], DL, MVT::i32); + if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) + return DAG.getNode( + X86ISD::VPERMILPV, DL, MVT::v8f32, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); + + if (Subtarget->hasAVX2()) + return DAG.getNode( + X86ISD::VPERMV, DL, MVT::v8f32, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, + DAG); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. 
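+  // For example, the v8f32 mask <12, 13, 14, 15, 0, 1, 2, 3> selects a whole
+  // 128-bit lane per result lane (V2's high lane, then V1's low lane), so it
+  // becomes a v4f64 lane shuffle followed by a trivial in-lane shuffle.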
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // If we have AVX2 then we always want to lower with a blend because at v8 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v8i32 shuffling.. +static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane we can use more + // efficient instructions that mirror the shuffles across the two 128-bit + // lanes. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + return V; + } + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG)) + return Shift; + + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + // If the shuffle patterns aren't repeated but it is a single input, directly + // generate a cross-lane VPERMD instruction. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], DL, MVT::i32); + return DAG.getNode( + X86ISD::VPERMV, DL, MVT::v8i32, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic blend lowering. 
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 16-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v16i16 shuffling.. +static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + return V; + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i16 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, + Mask, DAG); + + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // As this is a single-input shuffle, the repeated mask should be + // a strictly valid v8i16 mask that we can pass through to the v8i16 + // lowering to handle even the v16 case. + return lowerV8I16GeneralSingleInputVectorShuffle( + DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); + } + + SDValue PSHUFBMask[32]; + for (int i = 0; i < 16; ++i) { + if (Mask[i] == -1) { + PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); + continue; + } + + int M = i < 8 ? Mask[i] : Mask[i] - 8; + assert(M >= 0 && M < 8 && "Invalid single-input mask!"); + PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8); + PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8); + } + return DAG.getBitcast(MVT::v16i16, + DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, + DAG.getBitcast(MVT::v32i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v32i8, PSHUFBMask))); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic lowering. 
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 32-lane 8-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v32i8 shuffling.. +static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + return V; + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i8 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, + Mask, DAG); + + SDValue PSHUFBMask[32]; + for (int i = 0; i < 32; ++i) + PSHUFBMask[i] = + Mask[i] < 0 + ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL, + MVT::i8); + + return DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 256-bit x86 vector shuffles. +/// +/// This routine either breaks down the specific type of a 256-bit x86 vector +/// shuffle or splits it into two 128-bit shuffles and fuses the results back +/// together based on the available instructions. +static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. 
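+  // That is, exactly one mask element refers to V2 and it is element 0, as in
+  // the v8i32 mask <8, 1, 2, 3, 4, 5, 6, 7>.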
+ int NumElts = VT.getVectorNumElements(); + int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) { + return M >= NumElts; + }); + + if (NumV2Elements == 1 && Mask[0] >= NumElts) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, VT, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + + // Handle special cases where the lower or upper half is UNDEF. + if (SDValue V = + lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return V; + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we + // can check for those subtargets here and avoid much of the subtarget + // querying in the per-vector-type lowering routines. With AVX1 we have + // essentially *zero* ability to manipulate a 256-bit vector with integer + // types. Since we'll use floating point types there eventually, just + // immediately cast everything to a float and operate entirely in that domain. + if (VT.isInteger() && !Subtarget->hasAVX2()) { + int ElementBits = VT.getScalarSizeInBits(); + if (ElementBits < 32) + // No floating point type available, decompose into 128-bit vectors. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), + VT.getVectorNumElements()); + V1 = DAG.getBitcast(FpVT, V1); + V2 = DAG.getBitcast(FpVT, V2); + return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); + } + + switch (VT.SimpleTy) { + case MVT::v4f64: + return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i64: + return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8f32: + return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i32: + return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i16: + return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i8: + return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Not a valid 256-bit x86 vector type!"); + } +} + +/// \brief Try to lower a vector shuffle as a 128-bit shuffles. +static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, + ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + assert(VT.getScalarSizeInBits() == 64 && + "Unexpected element type size for 128bit shuffle."); + + // To handle 256 bit vector requires VLX and most probably + // function lowerV2X128VectorShuffle() is better solution. + assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle."); + + SmallVector<int, 4> WidenedMask; + if (!canWidenShuffleElements(Mask, WidenedMask)) + return SDValue(); + + // Form a 128-bit permutation. + // Convert the 64-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. + unsigned PermMask = 0, Imm = 0; + unsigned ControlBitsNum = WidenedMask.size() / 2; + + for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { + if (WidenedMask[i] == SM_SentinelZero) + return SDValue(); + + // Use first element in place of undef mask. + Imm = (WidenedMask[i] == SM_SentinelUndef) ? 
0 : WidenedMask[i]; + PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); + } + + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + +static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); + + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} + +/// \brief Handle lowering of 8-lane 64-bit floating point shuffles. +static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Shuf128; + + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit floating point shuffles. +static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 8-lane 64-bit integer shuffles. +static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; + + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit integer shuffles. 
+static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 32-lane 16-bit integer shuffles. +static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); + + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 64-lane 8-bit integer shuffles. +static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 512-bit x86 vector shuffles. +/// +/// This routine either breaks down the specific type of a 512-bit x86 vector +/// shuffle or splits it into two 256-bit shuffles and fuses the results back +/// together based on the available instructions. +static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/ basic ISA!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = + lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG)) + return Broadcast; + + // Dispatch to each element type for lowering. If we don't have supprot for + // specific element type shuffles at 512 bits, immediately split them and + // lower them. Each lowering routine of a given type is allowed to assume that + // the requisite ISA extensions for that element type are available. 
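+  // Note that v32i16 and v64i8 only have dedicated lowering with BWI; without
+  // it they break out of the switch below and take the splitting fallback.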
+ switch (VT.SimpleTy) { + case MVT::v8f64: + return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16f32: + return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i64: + return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i32: + return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i16: + if (Subtarget->hasBWI()) + return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + case MVT::v64i8: + if (Subtarget->hasBWI()) + return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + + default: + llvm_unreachable("Not a valid 512-bit x86 vector type!"); + } + + // Otherwise fall back on splitting. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); +} + +// Lower vXi1 vector shuffles. +// There is no a dedicated instruction on AVX-512 that shuffles the masks. +// The only way to shuffle bits is to sign-extend the mask vector to SIMD +// vector, shuffle and then truncate it back. +static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/o basic ISA!"); + MVT ExtVT; + switch (VT.SimpleTy) { + default: + llvm_unreachable("Expected a vector of i1 elements"); + case MVT::v2i1: + ExtVT = MVT::v2i64; + break; + case MVT::v4i1: + ExtVT = MVT::v4i32; + break; + case MVT::v8i1: + ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + break; + case MVT::v16i1: + ExtVT = MVT::v16i32; + break; + case MVT::v32i1: + ExtVT = MVT::v32i16; + break; + case MVT::v64i1: + ExtVT = MVT::v64i8; + break; + } + + if (ISD::isBuildVectorAllZeros(V1.getNode())) + V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V1.getNode())) + V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); + + if (V2.isUndef()) + V2 = DAG.getUNDEF(ExtVT); + else if (ISD::isBuildVectorAllZeros(V2.getNode())) + V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V2.getNode())) + V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); +} +/// \brief Top-level lowering for x86 vector shuffles. +/// +/// This handles decomposition, canonicalization, and lowering of all x86 +/// vector shuffles. Most of the specific lowering strategies are encapsulated +/// above in helper routines. The canonicalization attempts to widen shuffles +/// to involve fewer lanes of wider elements, consolidate symmetric patterns +/// s.t. only one of the two inputs needs to be tested, etc. 
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + MVT VT = Op.getSimpleValueType(); + int NumElements = VT.getVectorNumElements(); + SDLoc dl(Op); + bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); + + assert((VT.getSizeInBits() != 64 || Is1BitVector) && + "Can't lower MMX shuffles"); + + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + // When we create a shuffle node we put the UNDEF node to second operand, + // but in some cases the first operand may be transformed to UNDEF. + // In this case we should just commute the node. + if (V1IsUndef) + return DAG.getCommutedVectorShuffle(*SVOp); + + // Check for non-undef masks pointing at an undef vector and make the masks + // undef as well. This makes it easier to match the shuffle based solely on + // the mask. + if (V2IsUndef) + for (int M : Mask) + if (M >= NumElements) { + SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); + for (int &M : NewMask) + if (M >= NumElements) + M = -1; + return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); + } + + // We actually see shuffles that are entirely re-arrangements of a set of + // zero inputs. This mostly happens while decomposing complex shuffles into + // simple ones. Directly lower these as a buildvector of zeros. + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + if (Zeroable.all()) + return getZeroVector(VT, Subtarget, DAG, dl); + + // Try to collapse shuffles into using a vector type with fewer elements but + // wider element types. We cap this to not form integers or floating point + // elements wider than 64 bits, but it might be interesting to form i128 + // integers to handle flipping the low and high halves of AVX 256-bit vectors. + SmallVector<int, 16> WidenedMask; + if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && + canWidenShuffleElements(Mask, WidenedMask)) { + MVT NewEltVT = VT.isFloatingPoint() + ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) + : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); + MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); + // Make sure that the new vector type is legal. For example, v2f64 isn't + // legal on SSE1. + if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { + V1 = DAG.getBitcast(NewVT, V1); + V2 = DAG.getBitcast(NewVT, V2); + return DAG.getBitcast( + VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); + } + } + + int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; + for (int M : SVOp->getMask()) + if (M < 0) + ++NumUndefElements; + else if (M < NumElements) + ++NumV1Elements; + else + ++NumV2Elements; + + // Commute the shuffle as needed such that more elements come from V1 than + // V2. This allows us to match the shuffle pattern strictly on how many + // elements come from V1 without handling the symmetric cases. + if (NumV2Elements > NumV1Elements) + return DAG.getCommutedVectorShuffle(*SVOp); + + // When the number of V1 and V2 elements are the same, try to minimize the + // number of uses of V2 in the low half of the vector. When that is tied, + // ensure that the sum of indices for V1 is equal to or lower than the sum + // indices for V2. 
When those are equal, try to ensure that the number of odd + // indices for V1 is lower than the number of odd indices for V2. + if (NumV1Elements == NumV2Elements) { + int LowV1Elements = 0, LowV2Elements = 0; + for (int M : SVOp->getMask().slice(0, NumElements / 2)) + if (M >= NumElements) + ++LowV2Elements; + else if (M >= 0) + ++LowV1Elements; + if (LowV2Elements > LowV1Elements) { + return DAG.getCommutedVectorShuffle(*SVOp); + } else if (LowV2Elements == LowV1Elements) { + int SumV1Indices = 0, SumV2Indices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + SumV2Indices += i; + else if (SVOp->getMask()[i] >= 0) + SumV1Indices += i; + if (SumV2Indices < SumV1Indices) { + return DAG.getCommutedVectorShuffle(*SVOp); + } else if (SumV2Indices == SumV1Indices) { + int NumV1OddIndices = 0, NumV2OddIndices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + NumV2OddIndices += i % 2; + else if (SVOp->getMask()[i] >= 0) + NumV1OddIndices += i % 2; + if (NumV2OddIndices < NumV1OddIndices) + return DAG.getCommutedVectorShuffle(*SVOp); + } + } + } + + // For each vector width, delegate to a specialized lowering routine. + if (VT.is128BitVector()) + return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + if (VT.is256BitVector()) + return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + if (VT.is512BitVector()) + return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + if (Is1BitVector) + return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + llvm_unreachable("Unimplemented!"); +} + +// This function assumes its argument is a BUILD_VECTOR of constants or +// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is +// true. +static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, + unsigned &MaskValue) { + MaskValue = 0; + unsigned NumElems = BuildVector->getNumOperands(); + + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + // We don't handle the >2 lanes case right now. + unsigned NumLanes = (NumElems - 1) / 8 + 1; + if (NumLanes > 2) + return false; + + unsigned NumElemsInLane = NumElems / NumLanes; + + // Blend for v16i16 should be symmetric for the both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { + SDValue EltCond = BuildVector->getOperand(i); + SDValue SndLaneEltCond = + (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond; + + int Lane1Cond = -1, Lane2Cond = -1; + if (isa<ConstantSDNode>(EltCond)) + Lane1Cond = !isNullConstant(EltCond); + if (isa<ConstantSDNode>(SndLaneEltCond)) + Lane2Cond = !isNullConstant(SndLaneEltCond); + + unsigned LaneMask = 0; + if (Lane1Cond == Lane2Cond || Lane2Cond < 0) + // Lane1Cond != 0, means we want the first argument. + // Lane1Cond == 0, means we want the second argument. + // The encoding of this argument is 0 for the first argument, 1 + // for the second. Therefore, invert the condition. + LaneMask = !Lane1Cond << i; + else if (Lane1Cond < 0) + LaneMask = !Lane2Cond << i; + else + return false; + + MaskValue |= LaneMask; + if (NumLanes == 2) + MaskValue |= LaneMask << NumElemsInLane; + } + return true; +} + +/// \brief Try to lower a VSELECT instruction to a vector shuffle. 
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Cond = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + auto *CondBV = cast<BuildVectorSDNode>(Cond); + + // Only non-legal VSELECTs reach this lowering, convert those into generic + // shuffles and re-use the shuffle lowering path for blends. + SmallVector<int, 32> Mask; + for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { + SDValue CondElt = CondBV->getOperand(i); + Mask.push_back( + isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) + : -1); + } + return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); +} + +SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). + if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) + return SDValue(); + + // Try to lower this to a blend-style vector shuffle. This can handle all + // constant condition cases. + if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) + return BlendOp; + + // Variable blends are only legal from SSE4.1 onward. + if (!Subtarget->hasSSE41()) + return SDValue(); + + // Only some types will be legal on some subtargets. If we can emit a legal + // VSELECT-matching blend, return Op, and but if we need to expand, return + // a null value. + switch (Op.getSimpleValueType().SimpleTy) { + default: + // Most of the vector types have blends past SSE4.1. + return Op; + + case MVT::v32i8: + // The byte blends for AVX vectors were introduced only in AVX2. + if (Subtarget->hasAVX2()) + return Op; + + return SDValue(); + + case MVT::v8i16: + case MVT::v16i16: + // AVX-512 BWI and VLX features support VSELECT with i16 elements. + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + return Op; + + // FIXME: We should custom lower this by fixing the condition and using i8 + // blends. + return SDValue(); + } +} + +static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) + return SDValue(); + + if (VT.getSizeInBits() == 8) { + SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } + + if (VT.getSizeInBits() == 16) { + // If Idx is 0, it's cheaper to do a move instead of a pextrw. 
+ if (isNullConstant(Op.getOperand(1))) + return DAG.getNode( + ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), + Op.getOperand(1))); + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } + + if (VT == MVT::f32) { + // EXTRACTPS outputs to a GPR32 register which will require a movd to copy + // the result back to FR32 register. It's only worth matching if the + // result has a single use which is a store or a bitcast to i32. And in + // the case of a store, it's not worth it if the index is a constant 0, + // because a MOVSSmr can be used instead, which is smaller and faster. + if (!Op.hasOneUse()) + return SDValue(); + SDNode *User = *Op.getNode()->use_begin(); + if ((User->getOpcode() != ISD::STORE || + isNullConstant(Op.getOperand(1))) && + (User->getOpcode() != ISD::BITCAST || + User->getValueType(0) != MVT::i32)) + return SDValue(); + SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), + Op.getOperand(1)); + return DAG.getBitcast(MVT::f32, Extract); + } + + if (VT == MVT::i32 || VT == MVT::i64) { + // ExtractPS/pextrq works with constant index. + if (isa<ConstantSDNode>(Op.getOperand(1))) + return Op; + } + return SDValue(); +} + +/// Extract one bit from mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDValue Vec = Op.getOperand(0); + SDLoc dl(Vec); + MVT VecVT = Vec.getSimpleValueType(); + SDValue Idx = Op.getOperand(1); + MVT EltVT = Op.getSimpleValueType(); + + assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); + assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) && + "Unexpected vector type in ExtractBitFromMaskVector"); + + // variable index can't be handled in mask registers, + // extend vector to VR512 + if (!isa<ConstantSDNode>(Idx)) { + MVT ExtVT = (VecVT == MVT::v8i1 ? 
MVT::v8i64 : MVT::v16i32); + SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + ExtVT.getVectorElementType(), Ext, Idx); + return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); + } + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + const TargetRegisterClass* rc = getRegClassFor(VecVT); + if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) + rc = getRegClassFor(MVT::v16i1); + unsigned MaxSift = rc->getSize()*8 - 1; + Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); + Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + DAG.getConstant(MaxSift, dl, MVT::i8)); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, + DAG.getIntPtrConstant(0, dl)); +} + +SDValue +X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + MVT VecVT = Vec.getSimpleValueType(); + SDValue Idx = Op.getOperand(1); + + if (Op.getSimpleValueType() == MVT::i1) + return ExtractBitFromMaskVector(Op, DAG); + + if (!isa<ConstantSDNode>(Idx)) { + if (VecVT.is512BitVector() || + (VecVT.is256BitVector() && Subtarget->hasInt256() && + VecVT.getVectorElementType().getSizeInBits() == 32)) { + + MVT MaskEltVT = + MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); + MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / + MaskEltVT.getSizeInBits()); + + Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, + getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, + DAG.getConstant(0, dl, PtrVT)); + SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, + DAG.getConstant(0, dl, PtrVT)); + } + return SDValue(); + } + + // If this is a 256-bit vector result, first extract the 128-bit vector and + // then extract the element from the 128-bit vector. + if (VecVT.is256BitVector() || VecVT.is512BitVector()) { + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + // Get the 128-bit vector. + Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); + MVT EltVT = VecVT.getVectorElementType(); + + unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); + + // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 + // this can be done with a mask. + IdxVal &= ElemsPerChunk - 1; + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, + DAG.getConstant(IdxVal, dl, MVT::i32)); + } + + assert(VecVT.is128BitVector() && "Unexpected vector length"); + + if (Subtarget->hasSSE41()) + if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) + return Res; + + MVT VT = Op.getSimpleValueType(); + // TODO: handle v16i8. + if (VT.getSizeInBits() == 16) { + SDValue Vec = Op.getOperand(0); + if (isNullConstant(Op.getOperand(1))) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Vec), + Op.getOperand(1))); + // Transform it so it match pextrw which produces a 32-bit result. 
+ MVT EltVT = MVT::i32; + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } + + if (VT.getSizeInBits() == 32) { + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + if (Idx == 0) + return Op; + + // SHUFPS the element to the lowest double word, then movss. + int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; + MVT VVT = Op.getOperand(0).getSimpleValueType(); + SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), + DAG.getUNDEF(VVT), Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, + DAG.getIntPtrConstant(0, dl)); + } + + if (VT.getSizeInBits() == 64) { + // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b + // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught + // to match extract_elt for f64. + if (isNullConstant(Op.getOperand(1))) + return Op; + + // UNPCKHPD the element to the lowest double word, then movsd. + // Note if the lower 64 bits of the result of the UNPCKHPD is then stored + // to a f64mem, the whole operation is folded into a single MOVHPDmr. + int Mask[2] = { 1, -1 }; + MVT VVT = Op.getOperand(0).getSimpleValueType(); + SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), + DAG.getUNDEF(VVT), Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, + DAG.getIntPtrConstant(0, dl)); + } + + return SDValue(); +} + +/// Insert one bit to mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue Elt = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + MVT VecVT = Vec.getSimpleValueType(); + + if (!isa<ConstantSDNode>(Idx)) { + // Non constant index. Extend source and destination, + // insert element and then truncate the result. + MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); + MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32); + SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); + return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); + } + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); + if (IdxVal) + EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + if (Vec.getOpcode() == ISD::UNDEF) + return EltInVec; + return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); +} + +SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + + if (EltVT == MVT::i1) + return InsertBitToMaskVector(Op, DAG); + + SDLoc dl(Op); + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDValue N2 = Op.getOperand(2); + if (!isa<ConstantSDNode>(N2)) + return SDValue(); + auto *N2C = cast<ConstantSDNode>(N2); + unsigned IdxVal = N2C->getZExtValue(); + + // If the vector is wider than 128 bits, extract the 128-bit subvector, insert + // into that, and then insert the subvector back into the result. 
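+  // For example, inserting into element 5 of a v8i32 extracts the 128-bit
+  // chunk holding elements 4..7, inserts at index 5 & 3 == 1 within it, and
+  // then writes the chunk back with Insert128BitVector.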
+ if (VT.is256BitVector() || VT.is512BitVector()) { + // With a 256-bit vector, we can insert into the zero element efficiently + // using a blend if we have AVX or AVX2 and the right data type. + if (VT.is256BitVector() && IdxVal == 0) { + // TODO: It is worthwhile to cast integer to floating point and back + // and incur a domain crossing penalty if that's what we'll end up + // doing anyway after extracting to a 128-bit vector. + if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || + (Subtarget->hasAVX2() && EltVT == MVT::i32)) { + SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + N2 = DAG.getIntPtrConstant(1, dl); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); + } + } + + // Get the desired 128-bit vector chunk. + SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); + + // Insert the element into the desired chunk. + unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); + assert(isPowerOf2_32(NumEltsIn128)); + // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. + unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); + + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, + DAG.getConstant(IdxIn128, dl, MVT::i32)); + + // Insert the changed part back into the bigger vector + return Insert128BitVector(N0, V, IdxVal, DAG, dl); + } + assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); + + if (Subtarget->hasSSE41()) { + if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { + unsigned Opc; + if (VT == MVT::v8i16) { + Opc = X86ISD::PINSRW; + } else { + assert(VT == MVT::v16i8); + Opc = X86ISD::PINSRB; + } + + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::f32) { + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into + // these bits. For example (insert (extract, 3), 2) could be matched by + // putting the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // combine either bitwise AND or insert of float 0.0 to set these bits. + + bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); + if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { + // If this is an insertion of 32-bits into the low 32-bits of + // a vector, we prefer to generate a blend with immediate rather + // than an insertps. Blends are simpler operations in hardware and so + // will always have equal or better performance than insertps. + // But if optimizing for size and there's a load folding opportunity, + // generate insertps because blendps does not have a 32-bit memory + // operand form. + N2 = DAG.getIntPtrConstant(1, dl); + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); + } + N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); + // Create this as a scalar to vector.. + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::i32 || EltVT == MVT::i64) { + // PINSR* works with constant index. 
+ return Op; + } + } + + if (EltVT == MVT::i8) + return SDValue(); + + if (EltVT.getSizeInBits() == 16) { + // Transform it so it match pinsrw which expects a 16-bit value in a GR32 + // as its second argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); + } + return SDValue(); +} + +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + MVT OpVT = Op.getSimpleValueType(); + + // If this is a 256-bit vector result, first insert into a 128-bit + // vector and then insert into the 256-bit vector. + if (!OpVT.is128BitVector()) { + // Insert into a 128-bit vector. + unsigned SizeFactor = OpVT.getSizeInBits()/128; + MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), + OpVT.getVectorNumElements() / SizeFactor); + + Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); + + // Insert the 128-bit vector. + return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); + } + + if (OpVT == MVT::v1i64 && + Op.getOperand(0).getValueType() == MVT::i64) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); + + SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); + assert(OpVT.is128BitVector() && "Expected an SSE type!"); + return DAG.getBitcast( + OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); +} + +// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in +// a simple subregister reference or explicit instructions to grab +// upper bits of a vector. +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT ResVT = Op.getSimpleValueType(); + MVT InVT = In.getSimpleValueType(); + + if (Subtarget->hasFp256()) { + if (ResVT.is128BitVector() && + (InVT.is256BitVector() || InVT.is512BitVector()) && + isa<ConstantSDNode>(Idx)) { + return Extract128BitVector(In, IdxVal, DAG, dl); + } + if (ResVT.is256BitVector() && InVT.is512BitVector() && + isa<ConstantSDNode>(Idx)) { + return Extract256BitVector(In, IdxVal, DAG, dl); + } + } + return SDValue(); +} + +// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a +// simple superregister reference or explicit instructions to insert +// the upper bits of a vector. 
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (!Subtarget->hasAVX()) + return SDValue(); + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + + // Fold two 16-byte subvector loads into one 32-byte load: + // (insert_subvector (insert_subvector undef, (load addr), 0), + // (load addr + 16), Elts/2) + // --> load32 addr + if ((IdxVal == OpVT.getVectorNumElements() / 2) && + Vec.getOpcode() == ISD::INSERT_SUBVECTOR && + OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through a bitcast to get to the load. + if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) + SubVec2 = SubVec2.getOperand(0); + + if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = { SubVec2, SubVec }; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } + } + } + } + + if ((OpVT.is256BitVector() || OpVT.is512BitVector()) && + SubVecVT.is128BitVector()) + return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); + + if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) + return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + + if (OpVT.getVectorElementType() == MVT::i1) + return Insert1BitVector(Op, DAG); + + return SDValue(); +} + +// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as +// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is +// one of the above mentioned nodes. It has to be wrapped because otherwise +// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only +// be used to form addressing mode. These wrapped nodes will be selected +// into MOV32ri. +SDValue +X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + unsigned char OpFlag = 0; + unsigned WrapperKind = X86ISD::Wrapper; + CodeModel::Model M = DAG.getTarget().getCodeModel(); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + WrapperKind = X86ISD::WrapperRIP; + else if (Subtarget->isPICStyleGOT()) + OpFlag = X86II::MO_GOTOFF; + else if (Subtarget->isPICStyleStubPIC()) + OpFlag = X86II::MO_PIC_BASE_OFFSET; + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); + SDLoc DL(CP); + Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); + // With PIC, the address is actually $g + Offset. 
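+  // Here "$g" denotes the PIC base materialized by X86ISD::GlobalBaseReg (the
+  // GOT address for @GOTOFF references, or the picbase label on Darwin), and
+  // Offset is the wrapped constant-pool address expressed relative to it.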
+ if (OpFlag) { + Result = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); + } + + return Result; +} + +SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + unsigned char OpFlag = 0; + unsigned WrapperKind = X86ISD::Wrapper; + CodeModel::Model M = DAG.getTarget().getCodeModel(); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + WrapperKind = X86ISD::WrapperRIP; + else if (Subtarget->isPICStyleGOT()) + OpFlag = X86II::MO_GOTOFF; + else if (Subtarget->isPICStyleStubPIC()) + OpFlag = X86II::MO_PIC_BASE_OFFSET; + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); + SDLoc DL(JT); + Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); + + // With PIC, the address is actually $g + Offset. + if (OpFlag) + Result = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); + + return Result; +} + +SDValue +X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { + const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + unsigned char OpFlag = 0; + unsigned WrapperKind = X86ISD::Wrapper; + CodeModel::Model M = DAG.getTarget().getCodeModel(); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) { + if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) + OpFlag = X86II::MO_GOTPCREL; + WrapperKind = X86ISD::WrapperRIP; + } else if (Subtarget->isPICStyleGOT()) { + OpFlag = X86II::MO_GOT; + } else if (Subtarget->isPICStyleStubPIC()) { + OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; + } else if (Subtarget->isPICStyleStubNoDynamic()) { + OpFlag = X86II::MO_DARWIN_NONLAZY; + } + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); + + SDLoc DL(Op); + Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); + + // With PIC, the address is actually $g + Offset. + if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && + !Subtarget->is64Bit()) { + Result = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); + } + + // For symbols that require a load from a stub to get the address, emit the + // load. + if (isGlobalStubReference(OpFlag)) + Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + + return Result; +} + +SDValue +X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { + // Create the TargetBlockAddressAddress node. 
+ unsigned char OpFlags = + Subtarget->ClassifyBlockAddressReference(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); + SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); + else + Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); + + // With PIC, the address is actually $g + Offset. + if (isGlobalRelativeToPICBase(OpFlags)) { + Result = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); + } + + return Result; +} + +SDValue +X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, + int64_t Offset, SelectionDAG &DAG) const { + // Create the TargetGlobalAddress node, folding in the constant + // offset if it is legal. + unsigned char OpFlags = + Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); + CodeModel::Model M = DAG.getTarget().getCodeModel(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result; + if (OpFlags == X86II::MO_NO_FLAG && + X86::isOffsetSuitableForCodeModel(Offset, M)) { + // A direct static reference to a global. + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); + Offset = 0; + } else { + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); + } + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); + else + Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); + + // With PIC, the address is actually $g + Offset. + if (isGlobalRelativeToPICBase(OpFlags)) { + Result = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); + } + + // For globals that require a load from a stub to get the address, emit the + // load. + if (isGlobalStubReference(OpFlags)) + Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + + // If there was a non-zero offset that we didn't fold, create an explicit + // addition for it. + if (Offset != 0) + Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, + DAG.getConstant(Offset, dl, PtrVT)); + + return Result; +} + +SDValue +X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); + return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); +} + +static SDValue +GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, + SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, + unsigned char OperandFlags, bool LocalDynamic = false) { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SDLoc dl(GA); + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), + OperandFlags); + + X86ISD::NodeType CallType = LocalDynamic ? 
X86ISD::TLSBASEADDR + : X86ISD::TLSADDR; + + if (InFlag) { + SDValue Ops[] = { Chain, TGA, *InFlag }; + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); + } else { + SDValue Ops[] = { Chain, TGA }; + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); + } + + // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. + MFI->setAdjustsStack(true); + MFI->setHasCalls(true); + + SDValue Flag = Chain.getValue(1); + return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit +static SDValue +LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT) { + SDValue InFlag; + SDLoc dl(GA); // ? function entry point might be better + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, + SDLoc(), PtrVT), InFlag); + InFlag = Chain.getValue(1); + + return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit +static SDValue +LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT) { + return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, + X86::RAX, X86II::MO_TLSGD); +} + +static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + const EVT PtrVT, + bool is64Bit) { + SDLoc dl(GA); + + // Get the start address of the TLS block for this module. + X86MachineFunctionInfo* MFI = DAG.getMachineFunction() + .getInfo<X86MachineFunctionInfo>(); + MFI->incNumLocalDynamicTLSAccesses(); + + SDValue Base; + if (is64Bit) { + Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, + X86II::MO_TLSLD, /*LocalDynamic=*/true); + } else { + SDValue InFlag; + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); + InFlag = Chain.getValue(1); + Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, + X86II::MO_TLSLDM, /*LocalDynamic=*/true); + } + + // Note: the CleanupLocalDynamicTLSPass will remove redundant computations + // of Base. + + // Build x@dtpoff. + unsigned char OperandFlags = X86II::MO_DTPOFF; + unsigned WrapperKind = X86ISD::Wrapper; + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), OperandFlags); + SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); + + // Add x@dtpoff with the base. + return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. +static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT, TLSModel::Model model, + bool is64Bit, bool isPIC) { + SDLoc dl(GA); + + // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). + Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), + is64Bit ? 257 : 256)); + + SDValue ThreadPointer = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), + MachinePointerInfo(Ptr), false, false, false, 0); + + unsigned char OperandFlags = 0; + // Most TLS accesses are not RIP relative, even on x86-64. One exception is + // initialexec. + unsigned WrapperKind = X86ISD::Wrapper; + if (model == TLSModel::LocalExec) { + OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; + } else if (model == TLSModel::InitialExec) { + if (is64Bit) { + OperandFlags = X86II::MO_GOTTPOFF; + WrapperKind = X86ISD::WrapperRIP; + } else { + OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; + } + } else { + llvm_unreachable("Unexpected model"); + } + + // emit "addl x@ntpoff,%eax" (local exec) + // or "addl x@indntpoff,%eax" (initial exec) + // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) + SDValue TGA = + DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), + GA->getOffset(), OperandFlags); + SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); + + if (model == TLSModel::InitialExec) { + if (isPIC && !is64Bit) { + Offset = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), + Offset); + } + + Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + } + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); +} + +SDValue +X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + + // Cygwin uses emutls. + // FIXME: It may be EmulatedTLS-generic also for X86-Android. + if (Subtarget->isTargetWindowsCygwin()) + return LowerToTLSEmulatedModel(GA, DAG); + + const GlobalValue *GV = GA->getGlobal(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + if (Subtarget->isTargetELF()) { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + TLSModel::Model model = DAG.getTarget().getTLSModel(GV); + switch (model) { + case TLSModel::GeneralDynamic: + if (Subtarget->is64Bit()) + return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); + return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); + case TLSModel::LocalDynamic: + return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, + Subtarget->is64Bit()); + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(), + DAG.getTarget().getRelocationModel() == + Reloc::PIC_); + } + llvm_unreachable("Unknown TLS model."); + } + + if (Subtarget->isTargetDarwin()) { + // Darwin only has one model of TLS. Lower to that. + unsigned char OpFlag = 0; + unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? + X86ISD::WrapperRIP : X86ISD::Wrapper; + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) && + !Subtarget->is64Bit(); + if (PIC32) + OpFlag = X86II::MO_TLVP_PIC_BASE; + else + OpFlag = X86II::MO_TLVP; + SDLoc DL(Op); + SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, + GA->getValueType(0), + GA->getOffset(), OpFlag); + SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); + + // With PIC32, the address is actually $g + Offset. + if (PIC32) + Offset = DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), + Offset); + + // Lowering the machine isd will make sure everything is in the right + // location. + SDValue Chain = DAG.getEntryNode(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Args[] = { Chain, Offset }; + Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); + + // TLSCALL will be codegen'ed as call. 
Inform MFI that function has calls. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); + + // And our return value (tls address) is in the standard call return value + // location. + unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); + } + + if (Subtarget->isTargetKnownWindowsMSVC() || + Subtarget->isTargetWindowsGNU()) { + // Just use the implicit TLS architecture + // Need to generate someting similar to: + // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage + // ; from TEB + // mov ecx, dword [rel _tls_index]: Load index (from C runtime) + // mov rcx, qword [rdx+rcx*8] + // mov eax, .tls$:tlsvar + // [rax+rcx] contains the address + // Windows 64bit: gs:0x58 + // Windows 32bit: fs:__tls_array + + SDLoc dl(GA); + SDValue Chain = DAG.getEntryNode(); + + // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or + // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly + // use its literal value of 0x2C. + Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() + ? Type::getInt8PtrTy(*DAG.getContext(), + 256) + : Type::getInt32PtrTy(*DAG.getContext(), + 257)); + + SDValue TlsArray = Subtarget->is64Bit() + ? DAG.getIntPtrConstant(0x58, dl) + : (Subtarget->isTargetWindowsGNU() + ? DAG.getIntPtrConstant(0x2C, dl) + : DAG.getExternalSymbol("_tls_array", PtrVT)); + + SDValue ThreadPointer = + DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false, + false, false, 0); + + SDValue res; + if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { + res = ThreadPointer; + } else { + // Load the _tls_index variable + SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); + if (Subtarget->is64Bit()) + IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, + MachinePointerInfo(), MVT::i32, false, false, + false, 0); + else + IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false, + false, false, 0); + + auto &DL = DAG.getDataLayout(); + SDValue Scale = + DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT); + IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); + + res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); + } + + res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false, + false, 0); + + // Get the offset of start of .tls section + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), X86II::MO_SECREL); + SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); + } + + llvm_unreachable("TLS not implemented for this target."); +} + +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values +/// and take a 2 x i32 value to shift plus a shift amount. +static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + MVT VT = Op.getSimpleValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the + // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away + // during isel. 
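+  // In effect, for a 64-bit SRL_PARTS on a 32-bit target with parts (Lo, Hi)
+  // and shift amount s in [0, 64):
+  //   if ((s & 32) == 0) { Lo = shrd(Lo, Hi, s); Hi = Hi >> (s & 31); }
+  //   else               { Lo = Hi >> (s & 31);  Hi = 0; }
+  // SRA_PARTS fills with (Hi >> 31) instead of 0, and SHL_PARTS is the mirror
+  // image using shld. The CMOVs built below select between the two arms.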
+ SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, + DAG.getConstant(VTBits - 1, dl, MVT::i8)); + SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i8)) + : DAG.getConstant(0, dl, VT); + + SDValue Tmp2, Tmp3; + if (Op.getOpcode() == ISD::SHL_PARTS) { + Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); + Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); + } else { + Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); + Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); + } + + // If the shift amount is larger or equal than the width of a part we can't + // rely on the results of shld/shrd. Insert a test and select the appropriate + // values for large shift amounts. + SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i8)); + SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + AndNode, DAG.getConstant(0, dl, MVT::i8)); + + SDValue Hi, Lo; + SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; + SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; + + if (Op.getOpcode() == ISD::SHL_PARTS) { + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); + } else { + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); + } + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} + +SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + if (SrcVT.isVector()) { + if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { + return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + DAG.getUNDEF(SrcVT))); + } + if (SrcVT.getVectorElementType() == MVT::i1) { + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); + } + return SDValue(); + } + + assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && + "Unknown SINT_TO_FP to lower!"); + + // These are really Legal; return the operand so the caller accepts it as + // Legal. 
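+  // (CVTSI2SS/CVTSI2SD accept a signed i32 source directly, and a signed i64
+  // source via the REX.W form in 64-bit mode, so no expansion is needed.)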
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) + return Op; + if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && + Subtarget->is64Bit()) { + return Op; + } + + unsigned Size = SrcVT.getSizeInBits()/8; + MachineFunction &MF = DAG.getMachineFunction(); + auto PtrVT = getPointerTy(MF.getDataLayout()); + int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + SDValue Chain = DAG.getStore( + DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, + false, 0); + return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); +} + +SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, + SDValue StackSlot, + SelectionDAG &DAG) const { + // Build the FILD + SDLoc DL(Op); + SDVTList Tys; + bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); + if (useSSE) + Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); + else + Tys = DAG.getVTList(Op.getValueType(), MVT::Other); + + unsigned ByteSize = SrcVT.getSizeInBits()/8; + + FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); + MachineMemOperand *MMO; + if (FI) { + int SSFI = FI->getIndex(); + MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); + } else { + MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); + StackSlot = StackSlot.getOperand(1); + } + SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; + SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : + X86ISD::FILD, DL, + Tys, Ops, SrcVT, MMO); + + if (useSSE) { + Chain = Result.getValue(1); + SDValue InFlag = Result.getValue(2); + + // FIXME: Currently the FST is flagged to the FILD_FLAG. This + // shouldn't be necessary except that RFP cannot be live across + // multiple blocks. When stackifier is fixed, they can be uncoupled. + MachineFunction &MF = DAG.getMachineFunction(); + unsigned SSFISize = Op.getValueType().getSizeInBits()/8; + int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); + auto PtrVT = getPointerTy(MF.getDataLayout()); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = { + Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag + }; + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOStore, SSFISize, SSFISize); + + Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, + Ops, Op.getValueType(), MMO); + Result = DAG.getLoad( + Op.getValueType(), DL, Chain, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + false, false, false, 0); + } + + return Result; +} + +// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. +SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, + SelectionDAG &DAG) const { + // This algorithm is not obvious. Here it is what we're trying to output: + /* + movq %rax, %xmm0 + punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } + subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } + #ifdef __SSE3__ + haddpd %xmm0, %xmm0 + #else + pshufd $0x4e, %xmm0, %xmm1 + addpd %xmm1, %xmm0 + #endif + */ + + SDLoc dl(Op); + LLVMContext *Context = DAG.getContext(); + + // Build some magic constants. 
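+  // CV0 supplies the high dwords of two doubles: 0x43300000 is the top half of
+  // 0x1.0p52 and 0x45300000 the top half of 0x1.0p84. After the punpckldq,
+  // lane 0 holds 2^52 + lo32(x) and lane 1 holds 2^84 + hi32(x) * 2^32, so the
+  // subpd strips the biases and the final add reassembles x. Roughly the
+  // scalar equivalent (assuming IEEE-754 doubles) is:
+  //   double lo = bit_cast<double>(0x4330000000000000ULL | (x & 0xffffffffu)) - 0x1.0p52;
+  //   double hi = bit_cast<double>(0x4530000000000000ULL | (x >> 32)) - 0x1.0p84;
+  //   return lo + hi;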
+ static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + Constant *C0 = ConstantDataVector::get(*Context, CV0); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); + + SmallVector<Constant*,2> CV1; + CV1.push_back( + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4330000000000000ULL)))); + CV1.push_back( + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4530000000000000ULL)))); + Constant *C1 = ConstantVector::get(CV1); + SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); + + // Load the 64-bit value into an XMM register. + SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Op.getOperand(0)); + SDValue CLod0 = + DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + SDValue Unpck1 = + getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); + + SDValue CLod1 = + DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + // TODO: Are there any fast-math-flags to propagate here? + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + SDValue Result; + + if (Subtarget->hasSSE3()) { + // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. + Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); + } else { + SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub); + SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, + S2F, 0x4E, DAG); + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, + DAG.getBitcast(MVT::v2f64, Shuffle), Sub); + } + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, + DAG.getIntPtrConstant(0, dl)); +} + +// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. +SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + // FP constant to bias correct the final result. + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, + MVT::f64); + + // Load the 32-bit value into an XMM register. + SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, + Op.getOperand(0)); + + // Zero out the upper parts of the register. + Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); + + Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getBitcast(MVT::v2f64, Load), + DAG.getIntPtrConstant(0, dl)); + + // Or the load with the bias. + SDValue Or = DAG.getNode( + ISD::OR, dl, MVT::v2i64, + DAG.getBitcast(MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), + DAG.getBitcast(MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); + Or = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); + + // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); + + // Handle final rounding. + MVT DestVT = Op.getSimpleValueType(); + + if (DestVT.bitsLT(MVT::f64)) + return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, + DAG.getIntPtrConstant(0, dl)); + if (DestVT.bitsGT(MVT::f64)) + return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); + + // Handle final rounding. 
+ return Sub; +} + +static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // The algorithm is the following: + // #ifdef __SSE4_1__ + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + // #else + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + // #endif + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + // return (float4) lo + fhi; + + // We shouldn't use it when unsafe-fp-math is enabled though: we might later + // reassociate the two FADDs, and if we do that, the algorithm fails + // spectacularly (PR24512). + // FIXME: If we ever have some kind of Machine FMF, this should be marked + // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because + // there's also the MachineCombiner reassociations happening on Machine IR. + if (DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + SDLoc DL(Op); + SDValue V = Op->getOperand(0); + MVT VecIntVT = V.getSimpleValueType(); + bool Is128 = VecIntVT == MVT::v4i32; + MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; + // If we convert to something else than the supported type, e.g., to v4f64, + // abort early. + if (VecFloatVT != Op->getSimpleValueType(0)) + return SDValue(); + + unsigned NumElts = VecIntVT.getVectorNumElements(); + assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && + "Unsupported custom type"); + assert(NumElts <= 8 && "The size of the constant array must be fixed"); + + // In the #idef/#else code, we have in common: + // - The vector of constants: + // -- 0x4b000000 + // -- 0x53000000 + // - A shift: + // -- v >> 16 + + // Create the splat vector for 0x4b000000. + SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32); + SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, + CstLow, CstLow, CstLow, CstLow}; + SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstLowArray[0], NumElts)); + // Create the splat vector for 0x53000000. + SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32); + SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, + CstHigh, CstHigh, CstHigh, CstHigh}; + SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstHighArray[0], NumElts)); + + // Create the right shift. + SDValue CstShift = DAG.getConstant(16, DL, MVT::i32); + SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, + CstShift, CstShift, CstShift, CstShift}; + SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstShiftArray[0], NumElts)); + SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); + + SDValue Low, High; + if (Subtarget.hasSSE41()) { + MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); + SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); + // Low will be bitcasted right away, so do not bother bitcasting back to its + // original type. 
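+    // Viewing each i32 lane as two i16 halves, the 0xaa blend mask (0b10101010)
+    // takes the high halves from the 0x4b000000 splat and the low halves from
+    // v, i.e. (v & 0xffff) | 0x4b000000 per lane -- the same value the AND/OR
+    // sequence in the non-SSE4.1 branch computes. As floats, 0x4b000000 is
+    // 0x1.0p23 and 0x53000000 is 0x1.0p39, and the 0xD3000080 constant used
+    // below is -(0x1.0p39 + 0x1.0p23), so lo + (hi + CstFAdd) reconstructs v.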
+ Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, + VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); + SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); + // High will be bitcasted right away, so do not bother bitcasting back to + // its original type. + High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, + VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + } else { + SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32); + SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, + CstMask, CstMask, CstMask); + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); + Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); + + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); + } + + // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). + SDValue CstFAdd = DAG.getConstantFP( + APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32); + SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, + CstFAdd, CstFAdd, CstFAdd, CstFAdd}; + SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, + makeArrayRef(&CstFAddArray[0], NumElts)); + + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); + // TODO: Are there any fast-math-flags to propagate here? + SDValue FHigh = + DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); + // return (float4) lo + fhi; + SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); + return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); +} + +SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, + SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + MVT SVT = N0.getSimpleValueType(); + SDLoc dl(Op); + + switch (SVT.SimpleTy) { + default: + llvm_unreachable("Custom UINT_TO_FP is not supported!"); + case MVT::v4i8: + case MVT::v4i16: + case MVT::v8i8: + case MVT::v8i16: { + MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); + } + case MVT::v4i32: + case MVT::v8i32: + return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); + case MVT::v16i8: + case MVT::v16i16: + assert(Subtarget->hasAVX512()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); + } +} + +SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + if (Op.getSimpleValueType().isVector()) + return lowerUINT_TO_FP_vec(Op, DAG); + + // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't + // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform + // the optimization here. + if (DAG.SignBitIsZero(N0)) + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); + + MVT SrcVT = N0.getSimpleValueType(); + MVT DstVT = Op.getSimpleValueType(); + + if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && + (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) { + // Conversions from unsigned i32 to f32/f64 are legal, + // using VCVTUSI2SS/SD. 
Same for i64 in 64-bit mode. + return Op; + } + + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) + return LowerUINT_TO_FP_i64(Op, DAG); + if (SrcVT == MVT::i32 && X86ScalarSSEf64) + return LowerUINT_TO_FP_i32(Op, DAG); + if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) + return SDValue(); + + // Make a 64-bit buffer, and use it to build an FILD. + SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); + if (SrcVT == MVT::i32) { + SDValue WordOff = DAG.getConstant(4, dl, PtrVT); + SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff); + SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + StackSlot, MachinePointerInfo(), + false, false, 0); + SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), + OffsetSlot, MachinePointerInfo(), + false, false, 0); + SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); + return Fild; + } + + assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + StackSlot, MachinePointerInfo(), + false, false, 0); + // For i64 source, we need to add the appropriate power of 2 if the input + // was negative. This is the same as the optimization in + // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, + // we must be careful to do the computation in x87 extended precision, not + // in SSE. (The generic code can't know it's OK to do this, or how to.) + int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, 8, 8); + + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); + SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; + SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, + MVT::i64, MMO); + + APInt FF(32, 0x5F800000ULL); + + // Check whether the sign bit is set. + SDValue SignSet = DAG.getSetCC( + dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), + Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); + + // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. + SDValue FudgePtr = DAG.getConstantPool( + ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); + + // Get a pointer to FF if the sign bit was set, or to 0 otherwise. + SDValue Zero = DAG.getIntPtrConstant(0, dl); + SDValue Four = DAG.getIntPtrConstant(4, dl); + SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, + Zero, Four); + FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); + + // Load the value out, extending it from f32 to f80. + // FIXME: Avoid the extend by constructing the right constant pool? + SDValue Fudge = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, 4); + // Extend everything to 80 bits to force it to be done on x87. + // TODO: Are there any fast-math-flags to propagate here? + SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); + return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, + DAG.getIntPtrConstant(0, dl)); +} + +// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation +// is legal, or has an fp128 or f16 source (which needs to be promoted to f32), +// just return an <SDValue(), SDValue()> pair. 
+// Otherwise it is assumed to be a conversion from one of f32, f64 or f80 +// to i16, i32 or i64, and we lower it to a legal sequence. +// If lowered to the final integer result we return a <result, SDValue()> pair. +// Otherwise we lower it to a sequence ending with a FIST, return a +// <FIST, StackSlot> pair, and the caller is responsible for loading +// the final integer result from StackSlot. +std::pair<SDValue,SDValue> +X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { + SDLoc DL(Op); + + EVT DstTy = Op.getValueType(); + EVT TheVT = Op.getOperand(0).getValueType(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { + // f16 must be promoted before using the lowering in this routine. + // fp128 does not use this lowering. + return std::make_pair(SDValue(), SDValue()); + } + + // If using FIST to compute an unsigned i64, we'll need some fixup + // to handle values above the maximum signed i64. A FIST is always + // used for the 32-bit subtarget, but also for f80 on a 64-bit target. + bool UnsignedFixup = !IsSigned && + DstTy == MVT::i64 && + (!Subtarget->is64Bit() || + !isScalarFPTypeInSSEReg(TheVT)); + + if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { + // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. + // The low 32 bits of the fist result will have the correct uint32 result. + assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); + DstTy = MVT::i64; + } + + assert(DstTy.getSimpleVT() <= MVT::i64 && + DstTy.getSimpleVT() >= MVT::i16 && + "Unknown FP_TO_INT to lower!"); + + // These are really Legal. + if (DstTy == MVT::i32 && + isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) + return std::make_pair(SDValue(), SDValue()); + if (Subtarget->is64Bit() && + DstTy == MVT::i64 && + isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) + return std::make_pair(SDValue(), SDValue()); + + // We lower FP->int64 into FISTP64 followed by a load from a temporary + // stack slot. + MachineFunction &MF = DAG.getMachineFunction(); + unsigned MemSize = DstTy.getSizeInBits()/8; + int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + + unsigned Opc; + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } + + SDValue Chain = DAG.getEntryNode(); + SDValue Value = Op.getOperand(0); + SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. + + if (UnsignedFixup) { + // + // Conversion to unsigned i64 is implemented with a select, + // depending on whether the source value fits in the range + // of a signed i64. Let Thresh be the FP equivalent of + // 0x8000000000000000ULL. + // + // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; + // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); + // Fist-to-mem64 FistSrc + // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent + // to XOR'ing the high 32 bits with Adjust. + // + // Being a power of 2, Thresh is exactly representable in all FP formats. + // For X87 we'd like to use the smallest FP type for this constant, but + // for DAG type consistency we have to match the FP operand type. 
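+    // A scalar sketch of this fixup (Thresh is 2^63; 0x5f000000 is its bit
+    // pattern as a float):
+    //   uint64_t cvt(double v) {
+    //     if (v < 0x1.0p63) return (uint64_t)(int64_t)v;
+    //     return (uint64_t)(int64_t)(v - 0x1.0p63) ^ (1ULL << 63);
+    //   }
+    // The XOR of the top bit is what applying Adjust (0x80000000) to the high
+    // half of the 64-bit result implements.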
+ + APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); + LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; + bool LosesInfo = false; + if (TheVT == MVT::f64) + // The rounding mode is irrelevant as the conversion should be exact. + Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + &LosesInfo); + else if (TheVT == MVT::f80) + Status = Thresh.convert(APFloat::x87DoubleExtended, + APFloat::rmNearestTiesToEven, &LosesInfo); + + assert(Status == APFloat::opOK && !LosesInfo && + "FP conversion should have been exact"); + + SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); + + SDValue Cmp = DAG.getSetCC(DL, + getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Adjust = DAG.getSelect(DL, MVT::i32, Cmp, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0x80000000, DL, MVT::i32)); + SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); + Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); + } + + // FIXME This causes a redundant load/store if the SSE-class value is already + // in memory, such as if it is on the callstack. + if (isScalarFPTypeInSSEReg(TheVT)) { + assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); + Chain = DAG.getStore(Chain, DL, Value, StackSlot, + MachinePointerInfo::getFixedStack(MF, SSFI), false, + false, 0); + SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); + SDValue Ops[] = { + Chain, StackSlot, DAG.getValueType(TheVT) + }; + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOLoad, MemSize, MemSize); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); + Chain = Value.getValue(1); + SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); + StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + } + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, MemSize, MemSize); + + if (UnsignedFixup) { + + // Insert the FIST, load its result as two i32's, + // and XOR the high i32 with Adjust. + + SDValue FistOps[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + FistOps, DstTy, MMO); + + SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0); + SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, + DAG.getConstant(4, DL, PtrVT)); + + SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, + MachinePointerInfo(), + false, false, false, 0); + High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); + + if (Subtarget->is64Bit()) { + // Join High32 and Low32 into a 64-bit result. + // (High32 << 32) | Low32 + Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); + High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); + High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, + DAG.getConstant(32, DL, MVT::i8)); + SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); + return std::make_pair(Result, SDValue()); + } + + SDValue ResultOps[] = { Low32, High32 }; + + SDValue pair = IsReplace + ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) + : DAG.getMergeValues(ResultOps, DL); + return std::make_pair(pair, SDValue()); + } else { + // Build the FP_TO_INT*_IN_MEM + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + Ops, DstTy, MMO); + return std::make_pair(FIST, StackSlot); + } +} + +static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + SDLoc dl(Op); + + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) + return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); + + // Optimize vectors in AVX mode: + // + // v8i16 -> v8i32 + // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. + // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. + // Concat upper and lower parts. + // + // v4i32 -> v4i64 + // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. + // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. + // Concat upper and lower parts. + // + + if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && + ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && + ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) + return SDValue(); + + if (Subtarget->hasInt256()) + return DAG.getNode(X86ISD::VZEXT, dl, VT, In); + + SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); + SDValue Undef = DAG.getUNDEF(InVT); + bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; + SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); + SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); + + MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getBitcast(HVT, OpLo); + OpHi = DAG.getBitcast(HVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); +} + +static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + SDLoc DL(Op); + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) + return SDValue(); + + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + return DAG.getNode(X86ISD::VZEXT, DL, VT, In); + + assert(InVT.getVectorElementType() == MVT::i1); + MVT ExtVT = NumElts == 8 ? 
MVT::v8i64 : MVT::v16i32; + SDValue One = + DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT); + SDValue Zero = + DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); + + SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); + if (VT.is512BitVector()) + return V; + return DAG.getNode(X86ISD::VTRUNC, DL, VT, V); +} + +static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (Subtarget->hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) + return Res; + + return SDValue(); +} + +static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT SVT = In.getSimpleValueType(); + + if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) + return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); + + if (Subtarget->hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) + return Res; + + assert(!VT.is256BitVector() || !SVT.is128BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()); + return SDValue(); +} + +static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type."); + + // Shift LSB to MSB and use VPMOVB2M - SKX. + unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M + // Shift packed bytes not supported natively, bitcast to dword + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + ShiftNode = DAG.getBitcast(InVT, ShiftNode); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + + // Shift LSB to MSB, extend if necessary and use TESTM. + unsigned NumElts = InVT.getVectorNumElements(); + if (InVT.getSizeInBits() < 512 && + (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || + !Subtarget->hasVLX())) { + assert((NumElts == 8 || NumElts == 16) && "Unexected vector type."); + + // TESTD/Q should be used (if BW supported we use CVT2MASK above), + // so vector should be extended to packed dword/qword. 
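+    // Sign-extending preserves bit 0 of each element; the LSB is then shifted
+    // into the sign bit and TESTM (mask bit set where (a & b) != 0) produces a
+    // 1 exactly for the elements whose low bit was set.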
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + ShiftInx = InVT.getScalarSizeInBits() - 1; + } + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); +} + +SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + if (VT == MVT::i1) { + assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && + "Invalid scalar TRUNCATE operation"); + if (InVT.getSizeInBits() >= 32) + return SDValue(); + In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); + return DAG.getNode(ISD::TRUNCATE, DL, VT, In); + } + assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && + "Invalid TRUNCATE operation"); + + if (VT.getVectorElementType() == MVT::i1) + return LowerTruncateVecI1(Op, DAG, Subtarget); + + // vpmovqb/w/d, vpmovdb/w, vpmovwb + if (Subtarget->hasAVX512()) { + // word to byte only under BWI + if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8 + return DAG.getNode(X86ISD::VTRUNC, DL, VT, + DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + } + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { + // On AVX2, v4i64 -> v4i32 becomes VPERMD. + if (Subtarget->hasInt256()) { + static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; + In = DAG.getBitcast(MVT::v8i32, In); + In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), + ShufMask); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, + DAG.getIntPtrConstant(0, DL)); + } + + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(0, DL)); + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(2, DL)); + OpLo = DAG.getBitcast(MVT::v4i32, OpLo); + OpHi = DAG.getBitcast(MVT::v4i32, OpHi); + static const int ShufMask[] = {0, 2, 4, 6}; + return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); + } + + if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { + // On AVX2, v8i32 -> v8i16 becomed PSHUFB. 
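+    // The PSHUFB mask built below keeps bytes {0,1,4,5,8,9,12,13} of each
+    // 128-bit lane (the low 16 bits of every dword; index 0x80 writes a zero),
+    // leaving each lane's four truncated i16 values in its low 64 bits. The
+    // v4i64 shuffle {0, 2} that follows packs those two quadwords into the low
+    // 128 bits, which are then extracted and bitcast to v8i16.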
+ if (Subtarget->hasInt256()) { + In = DAG.getBitcast(MVT::v32i8, In); + + SmallVector<SDValue,32> pshufbMask; + for (unsigned i = 0; i < 2; ++i) { + pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8)); + for (unsigned j = 0; j < 8; ++j) + pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); + } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); + In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); + In = DAG.getBitcast(MVT::v4i64, In); + + static const int ShufMask[] = {0, 2, -1, -1}; + In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), + &ShufMask[0]); + In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(0, DL)); + return DAG.getBitcast(VT, In); + } + + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(0, DL)); + + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(4, DL)); + + OpLo = DAG.getBitcast(MVT::v16i8, OpLo); + OpHi = DAG.getBitcast(MVT::v16i8, OpHi); + + // The PSHUFB mask: + static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1}; + + SDValue Undef = DAG.getUNDEF(MVT::v16i8); + OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); + + OpLo = DAG.getBitcast(MVT::v4i32, OpLo); + OpHi = DAG.getBitcast(MVT::v4i32, OpHi); + + // The MOVLHPS Mask: + static const int ShufMask2[] = {0, 1, 4, 5}; + SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); + return DAG.getBitcast(MVT::v8i16, res); + } + + // Handle truncation of V256 to V128 using shuffles. + if (!VT.is128BitVector() || !InVT.is256BitVector()) + return SDValue(); + + assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); + + unsigned NumElems = VT.getVectorNumElements(); + MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); + + SmallVector<int, 16> MaskVec(NumElems * 2, -1); + // Prepare truncation shuffle mask + for (unsigned i = 0; i != NumElems; ++i) + MaskVec[i] = i * 2; + SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), + DAG.getUNDEF(NVT), &MaskVec[0]); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, + DAG.getIntPtrConstant(0, DL)); +} + +SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + assert(!Op.getSimpleValueType().isVector()); + + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ true, /*IsReplace=*/ false); + SDValue FIST = Vals.first, StackSlot = Vals.second; + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. + if (!FIST.getNode()) + return Op; + + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), SDLoc(Op), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + + // The node is the result. 
+ return FIST; +} + +SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ false, /*IsReplace=*/ false); + SDValue FIST = Vals.first, StackSlot = Vals.second; + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. + if (!FIST.getNode()) + return Op; + + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), SDLoc(Op), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + + // The node is the result. + return FIST; +} + +static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT SVT = In.getSimpleValueType(); + + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); + + return DAG.getNode(X86ISD::VFPEXT, DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, + In, DAG.getUNDEF(SVT))); +} + +/// The only differences between FABS and FNEG are the mask and the logic op. +/// FNEG also has a folding opportunity for FNEG(FABS(x)). +static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { + assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && + "Wrong opcode for lowering FABS or FNEG."); + + bool IsFABS = (Op.getOpcode() == ISD::FABS); + + // If this is a FABS and it has an FNEG user, bail out to fold the combination + // into an FNABS. We'll lower the FABS after that if it is still in use. + if (IsFABS) + for (SDNode *User : Op->uses()) + if (User->getOpcode() == ISD::FNEG) + return Op; + + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + bool IsF128 = (VT == MVT::f128); + + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to + // decide if we should generate a 16-byte constant mask when we only need 4 or + // 8 bytes for the scalar case. + + MVT LogicVT; + MVT EltVT; + unsigned NumElts; + + if (VT.isVector()) { + LogicVT = VT; + EltVT = VT.getVectorElementType(); + NumElts = VT.getVectorNumElements(); + } else if (IsF128) { + // SSE instructions are used for optimized f128 logical operations. + LogicVT = MVT::f128; + EltVT = VT; + NumElts = 1; + } else { + // There are no scalar bitwise logical SSE/AVX instructions, so we + // generate a 16-byte vector constant and logic op even for the scalar case. + // Using a 16-byte mask allows folding the load of the mask with + // the logic op, so it can save (~4 bytes) on code size. + LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + EltVT = VT; + NumElts = (VT == MVT::f64) ? 2 : 4; + } + + unsigned EltBits = EltVT.getSizeInBits(); + LLVMContext *Context = DAG.getContext(); + // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... + APInt MaskElt = + IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); + Constant *C = ConstantInt::get(*Context, MaskElt); + C = ConstantVector::getSplat(NumElts, C); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); + SDValue Mask = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); + + SDValue Op0 = Op.getOperand(0); + bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); + unsigned LogicOp = + IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; + SDValue Operand = IsFNABS ? 
Op0.getOperand(0) : Op0; + + if (VT.isVector() || IsF128) + return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + + // For the scalar case extend to a 128-bit vector, perform the logic op, + // and extract the scalar result back out. + Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); + SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, + DAG.getIntPtrConstant(0, dl)); +} + +static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + LLVMContext *Context = DAG.getContext(); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT SrcVT = Op1.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); + + // If second operand is smaller, extend it first. + if (SrcVT.bitsLT(VT)) { + Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); + SrcVT = VT; + } + // And if it is bigger, shrink it first. + if (SrcVT.bitsGT(VT)) { + Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl)); + SrcVT = VT; + } + + // At this point the operands and the result should have the same + // type, and that won't be f80 since that is not custom lowered. + assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + "Unexpected type in LowerFCOPYSIGN"); + + const fltSemantics &Sem = + VT == MVT::f64 ? APFloat::IEEEdouble : + (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); + const unsigned SizeInBits = VT.getSizeInBits(); + + SmallVector<Constant *, 4> CV( + VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), + ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); + + // First, clear all bits but the sign bit from the second operand (sign). + CV[0] = ConstantFP::get(*Context, + APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); + Constant *C = ConstantVector::get(CV); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16); + + // Perform all logic operations as 16-byte vectors because there are no + // scalar FP logic instructions in SSE. This allows load folding of the + // constants into the logic instructions. + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); + SDValue Mask1 = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + if (!IsF128) + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); + + // Next, clear the sign bit from the first operand (magnitude). + // If it's a constant, we can clear it here. + if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) { + APFloat APF = Op0CN->getValueAPF(); + // If the magnitude is a positive zero, the sign bit alone is enough. + if (APF.isPosZero()) + return IsF128 ? SignBit : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); + APF.clearSign(); + CV[0] = ConstantFP::get(*Context, APF); + } else { + CV[0] = ConstantFP::get( + *Context, + APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); + } + C = ConstantVector::get(CV); + CPIdx = DAG.getConstantPool(C, PtrVT, 16); + SDValue Val = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + // If the magnitude operand wasn't a constant, we need to AND out the sign. 
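+ // A rough scalar example for f32: copysign(x, y) is formed here as
+ //   (x & 0x7fffffff) | (y & 0x80000000),
+ // where Mask1 above supplied the 0x80000000 sign mask applied to Op1 and
+ // Val holds the 0x7fffffff magnitude mask (or the already-cleared constant).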
+ if (!isa<ConstantFPSDNode>(Op0)) { + if (!IsF128) + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); + } + // OR the magnitude value with the sign bit. + Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); + return IsF128 ? Val : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); +} + +static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { + SDValue N0 = Op.getOperand(0); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). + SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, + DAG.getConstant(1, dl, VT)); + return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT)); +} + +// Check whether an OR'd tree is PTEST-able. +static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (!Op->hasOneUse()) + return SDValue(); + + SDNode *N = Op.getNode(); + SDLoc DL(N); + + SmallVector<SDValue, 8> Opnds; + DenseMap<SDValue, unsigned> VecInMap; + SmallVector<SDValue, 8> VecIns; + EVT VT = MVT::Other; + + // Recognize a special case where a vector is casted into wide integer to + // test all 0s. + Opnds.push_back(N->getOperand(0)); + Opnds.push_back(N->getOperand(1)); + + for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { + SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; + // BFS traverse all OR'd operands. + if (I->getOpcode() == ISD::OR) { + Opnds.push_back(I->getOperand(0)); + Opnds.push_back(I->getOperand(1)); + // Re-evaluate the number of nodes to be traversed. + e += 2; // 2 more nodes (LHS and RHS) are pushed. + continue; + } + + // Quit if a non-EXTRACT_VECTOR_ELT + if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Quit if without a constant index. + SDValue Idx = I->getOperand(1); + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + SDValue ExtractedFromVec = I->getOperand(0); + DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); + if (M == VecInMap.end()) { + VT = ExtractedFromVec.getValueType(); + // Quit if not 128/256-bit vector. + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + // Quit if not the same type. + if (VecInMap.begin() != VecInMap.end() && + VT != VecInMap.begin()->first.getValueType()) + return SDValue(); + M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; + VecIns.push_back(ExtractedFromVec); + } + M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); + } + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Not extracted from 128-/256-bit vector."); + + unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; + + for (DenseMap<SDValue, unsigned>::const_iterator + I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { + // Quit if not all elements are used. + if (I->second != FullMask) + return SDValue(); + } + + MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + + // Cast all vectors into TestVT for PTEST. + for (unsigned i = 0, e = VecIns.size(); i < e; ++i) + VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); + + // If more than one full vectors are evaluated, OR them first before PTEST. 
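+ // Illustrative reduction, assuming four extracted source vectors A, B, C, D:
+ // the loop below appends (A|B), then (C|D), then ((A|B)|(C|D)); the final
+ // PTEST of that last value against itself sets ZF only if every element of
+ // every source vector is zero.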
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { + // Each iteration will OR 2 nodes and append the result until there is only + // 1 node left, i.e. the final OR'd value of all vectors. + SDValue LHS = VecIns[Slot]; + SDValue RHS = VecIns[Slot + 1]; + VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); + } + + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, + VecIns.back(), VecIns.back()); +} + +/// \brief return true if \c Op has a use that doesn't just read flags. +static bool hasNonFlagsUse(SDValue Op) { + for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; + ++UI) { + SDNode *User = *UI; + unsigned UOpNo = UI.getOperandNo(); + if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { + // Look pass truncate. + UOpNo = User->use_begin().getOperandNo(); + User = *User->use_begin(); + } + + if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && + !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) + return true; + } + return false; +} + +/// Emit nodes that will be selected as "test Op0,Op0", or something +/// equivalent. +SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, + SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::i1) { + SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, + DAG.getConstant(0, dl, MVT::i8)); + } + // CF and OF aren't always set the way we want. Determine which + // of these we need. + bool NeedCF = false; + bool NeedOF = false; + switch (X86CC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + NeedCF = true; + break; + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + case X86::COND_O: case X86::COND_NO: { + // Check if we really need to set the + // Overflow flag. If NoSignedWrap is present + // that is not actually needed. + switch (Op->getOpcode()) { + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SHL: { + const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode()); + if (BinNode->Flags.hasNoSignedWrap()) + break; + } + default: + NeedOF = true; + break; + } + break; + } + } + // See if we can use the EFLAGS value from the operand instead of + // doing a separate TEST. TEST always sets OF and CF to 0, so unless + // we prove that the arithmetic won't overflow, we can't use OF or CF. + if (Op.getResNo() != 0 || NeedOF || NeedCF) { + // Emit a CMP with 0, which is the TEST pattern. + //if (Op.getValueType() == MVT::i1) + // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op, + // DAG.getConstant(0, MVT::i1)); + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, dl, Op.getValueType())); + } + unsigned Opcode = 0; + unsigned NumOperands = 0; + + // Truncate operations may prevent the merge of the SETCC instruction + // and the arithmetic instruction before it. Attempt to truncate the operands + // of the arithmetic instruction and use a reduced bit-width instruction. + bool NeedTruncation = false; + SDValue ArithOp = Op; + if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { + SDValue Arith = Op->getOperand(0); + // Both the trunc and the arithmetic op need to have one user each. 
+ if (Arith->hasOneUse()) + switch (Arith.getOpcode()) { + default: break; + case ISD::ADD: + case ISD::SUB: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + NeedTruncation = true; + ArithOp = Arith; + } + } + } + + // NOTICE: In the code below we use ArithOp to hold the arithmetic operation + // which may be the result of a CAST. We use the variable 'Op', which is the + // non-casted variable when we check for possible users. + switch (ArithOp.getOpcode()) { + case ISD::ADD: + // Due to an isel shortcoming, be conservative if this add is likely to be + // selected as part of a load-modify-store instruction. When the root node + // in a match is a store, isel doesn't know how to remap non-chain non-flag + // uses of other nodes in the match, such as the ADD in this case. This + // leads to the ADD being left around and reselected, with the result being + // two adds in the output. Alas, even if none our users are stores, that + // doesn't prove we're O.K. Ergo, if we have any parents that aren't + // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require + // climbing the DAG back to the root, and it doesn't seem to be worth the + // effort. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() != ISD::CopyToReg && + UI->getOpcode() != ISD::SETCC && + UI->getOpcode() != ISD::STORE) + goto default_case; + + if (ConstantSDNode *C = + dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { + // An add of one will be selected as an INC. + if (C->isOne() && !Subtarget->slowIncDec()) { + Opcode = X86ISD::INC; + NumOperands = 1; + break; + } + + // An add of negative one (subtract of one) will be selected as a DEC. + if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { + Opcode = X86ISD::DEC; + NumOperands = 1; + break; + } + } + + // Otherwise use a regular EFLAGS-setting add. + Opcode = X86ISD::ADD; + NumOperands = 2; + break; + case ISD::SHL: + case ISD::SRL: + // If we have a constant logical shift that's only used in a comparison + // against zero turn it into an equivalent AND. This allows turning it into + // a TEST instruction later. + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && + isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned ShAmt = Op->getConstantOperandVal(1); + if (ShAmt >= BitWidth) // Avoid undefined shifts. + break; + APInt Mask = ArithOp.getOpcode() == ISD::SRL + ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) + : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); + if (!Mask.isSignedIntN(32)) // Avoid large immediates. + break; + SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), + DAG.getConstant(Mask, dl, VT)); + DAG.ReplaceAllUsesWith(Op, New); + Op = New; + } + break; + + case ISD::AND: + // If the primary and result isn't used, don't bother using X86ISD::AND, + // because a TEST instruction will be better. + if (!hasNonFlagsUse(Op)) + break; + // FALL THROUGH + case ISD::SUB: + case ISD::OR: + case ISD::XOR: + // Due to the ISEL shortcoming noted above, be conservative if this op is + // likely to be selected as part of a load-modify-store instruction. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() == ISD::STORE) + goto default_case; + + // Otherwise use a regular EFLAGS-setting instruction. 
+ switch (ArithOp.getOpcode()) { + default: llvm_unreachable("unexpected operator!"); + case ISD::SUB: Opcode = X86ISD::SUB; break; + case ISD::XOR: Opcode = X86ISD::XOR; break; + case ISD::AND: Opcode = X86ISD::AND; break; + case ISD::OR: { + if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); + if (EFLAGS.getNode()) + return EFLAGS; + } + Opcode = X86ISD::OR; + break; + } + } + + NumOperands = 2; + break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + return SDValue(Op.getNode(), 1); + default: + default_case: + break; + } + + // If we found that truncation is beneficial, perform the truncation and + // update 'Op'. + if (NeedTruncation) { + EVT VT = Op.getValueType(); + SDValue WideVal = Op->getOperand(0); + EVT WideVT = WideVal.getValueType(); + unsigned ConvertedOp = 0; + // Use a target machine opcode to prevent further DAGCombine + // optimizations that may separate the arithmetic operations + // from the setcc node. + switch (WideVal.getOpcode()) { + default: break; + case ISD::ADD: ConvertedOp = X86ISD::ADD; break; + case ISD::SUB: ConvertedOp = X86ISD::SUB; break; + case ISD::AND: ConvertedOp = X86ISD::AND; break; + case ISD::OR: ConvertedOp = X86ISD::OR; break; + case ISD::XOR: ConvertedOp = X86ISD::XOR; break; + } + + if (ConvertedOp) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { + SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); + SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); + Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); + } + } + } + + if (Opcode == 0) + // Emit a CMP with 0, which is the TEST pattern. + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, dl, Op.getValueType())); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); + + SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); + DAG.ReplaceAllUsesWith(Op, New); + return SDValue(New.getNode(), 1); +} + +/// Emit nodes that will be selected as "cmp Op0,Op1", or something +/// equivalent. +SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + SDLoc dl, SelectionDAG &DAG) const { + if (isNullConstant(Op1)) + return EmitTest(Op0, X86CC, dl, DAG); + + assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) && + "Unexpected comparison operation for MVT::i1 operands"); + + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || + Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { + // Do the comparison at i32 if it's smaller, besides the Atom case. + // This avoids subregister aliasing issues. Keep the smaller reference + // if we're optimizing for size, however, as that'll allow better folding + // of memory operations. + if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && + !DAG.getMachineFunction().getFunction()->optForMinSize() && + !Subtarget->isAtom()) { + unsigned ExtendOp = + isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); + Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); + } + // Use SUB instead of CMP to enable CSE between SUB and CMP. 
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); + SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, + Op0, Op1); + return SDValue(Sub.getNode(), 1); + } + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); +} + +/// Convert a comparison if required by the subtarget. +SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, + SelectionDAG &DAG) const { + // If the subtarget does not support the FUCOMI instruction, floating-point + // comparisons have to be converted. + if (Subtarget->hasCMov() || + Cmp.getOpcode() != X86ISD::CMP || + !Cmp.getOperand(0).getValueType().isFloatingPoint() || + !Cmp.getOperand(1).getValueType().isFloatingPoint()) + return Cmp; + + // The instruction selector will select an FUCOM instruction instead of + // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence + // build an SDNode sequence that transfers the result from FPSW into EFLAGS: + // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) + SDLoc dl(Cmp); + SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); + SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); + SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, + DAG.getConstant(8, dl, MVT::i8)); + SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); + return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); +} + +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + EVT VT = Op.getValueType(); + const char *RecipOp; + + // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. + // TODO: Add support for AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 + // instructions: convert to single, rsqrtss, convert back to double, refine + // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if (VT == MVT::f32 && Subtarget->hasSSE1()) + RecipOp = "sqrtf"; + else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || + (VT == MVT::v8f32 && Subtarget->hasAVX())) + RecipOp = "vec-sqrtf"; + else + return SDValue(); + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); +} + +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRecipEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + EVT VT = Op.getValueType(); + const char *RecipOp; + + // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. + // TODO: Add support for AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // reciprocal estimate with refinement on x86 prior to FMA requires + // 15 instructions: convert to single, rcpss, convert back to double, refine + // (3 steps = 12 insts). 
If an 'rcpsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if (VT == MVT::f32 && Subtarget->hasSSE1()) + RecipOp = "divf"; + else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || + (VT == MVT::v8f32 && Subtarget->hasAVX())) + RecipOp = "vec-divf"; + else + return SDValue(); + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); +} + +/// If we have at least two divisions that use the same divisor, convert to +/// multplication by a reciprocal. This may need to be adjusted for a given +/// CPU if a division's cost is not at least twice the cost of a multiplication. +/// This is because we still need one division to calculate the reciprocal and +/// then we need two multiplies by that reciprocal as replacements for the +/// original divisions. +unsigned X86TargetLowering::combineRepeatedFPDivisors() const { + return 2; +} + +/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node +/// if it's possible. +SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, + SDLoc dl, SelectionDAG &DAG) const { + SDValue Op0 = And.getOperand(0); + SDValue Op1 = And.getOperand(1); + if (Op0.getOpcode() == ISD::TRUNCATE) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::TRUNCATE) + Op1 = Op1.getOperand(0); + + SDValue LHS, RHS; + if (Op1.getOpcode() == ISD::SHL) + std::swap(Op0, Op1); + if (Op0.getOpcode() == ISD::SHL) { + if (isOneConstant(Op0.getOperand(0))) { + // If we looked past a truncate, check that it's only truncating away + // known zeros. + unsigned BitWidth = Op0.getValueSizeInBits(); + unsigned AndBitWidth = And.getValueSizeInBits(); + if (BitWidth > AndBitWidth) { + APInt Zeros, Ones; + DAG.computeKnownBits(Op0, Zeros, Ones); + if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) + return SDValue(); + } + LHS = Op1; + RHS = Op0.getOperand(1); + } + } else if (Op1.getOpcode() == ISD::Constant) { + ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); + uint64_t AndRHSVal = AndRHS->getZExtValue(); + SDValue AndLHS = Op0; + + if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { + LHS = AndLHS.getOperand(0); + RHS = AndLHS.getOperand(1); + } + + // Use BT if the immediate can't be encoded in a TEST instruction. + if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { + LHS = AndLHS; + RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType()); + } + } + + if (LHS.getNode()) { + // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT + // instruction. Since the shift amount is in-range-or-undefined, we know + // that doing a bittest on the i32 value is ok. We extend to i32 because + // the encoding for the i16 version is larger than the i32 version. + // Also promote i16 to i32 for performance / code size reason. + if (LHS.getValueType() == MVT::i8 || + LHS.getValueType() == MVT::i16) + LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); + + // If the operand types disagree, extend the shift amount to match. Since + // BT ignores high bits (like shifts) we can use anyextend. + if (LHS.getValueType() != RHS.getValueType()) + RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); + + SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); + X86::CondCode Cond = CC == ISD::SETEQ ? 
X86::COND_AE : X86::COND_B; + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(Cond, dl, MVT::i8), BT); + } + + return SDValue(); +} + +/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point +/// mask CMPs. +static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, + SDValue &Op1) { + unsigned SSECC; + bool Swap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETOEQ: + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETOGT: + case ISD::SETGT: Swap = true; // Fallthrough + case ISD::SETLT: + case ISD::SETOLT: SSECC = 1; break; + case ISD::SETOGE: + case ISD::SETGE: Swap = true; // Fallthrough + case ISD::SETLE: + case ISD::SETOLE: SSECC = 2; break; + case ISD::SETUO: SSECC = 3; break; + case ISD::SETUNE: + case ISD::SETNE: SSECC = 4; break; + case ISD::SETULE: Swap = true; // Fallthrough + case ISD::SETUGE: SSECC = 5; break; + case ISD::SETULT: Swap = true; // Fallthrough + case ISD::SETUGT: SSECC = 6; break; + case ISD::SETO: SSECC = 7; break; + case ISD::SETUEQ: + case ISD::SETONE: SSECC = 8; break; + } + if (Swap) + std::swap(Op0, Op1); + + return SSECC; +} + +// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 +// ones, and then concatenate the result back. +static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + + assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && + "Unsupported value type for operation"); + + unsigned NumElems = VT.getVectorNumElements(); + SDLoc dl(Op); + SDValue CC = Op.getOperand(2); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); + + // Issue the operation on the smaller types and concatenate the result back + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); +} + +static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Unexpected type for boolean compare operation"); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, + DAG.getConstant(-1, dl, VT)); + SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1, + DAG.getConstant(-1, dl, VT)); + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETEQ: + // (x == y) -> ~(x ^ y) + return DAG.getNode(ISD::XOR, dl, VT, + DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), + DAG.getConstant(-1, dl, VT)); + case ISD::SETNE: + // (x != y) -> (x ^ y) + return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); + case ISD::SETUGT: + case ISD::SETGT: + // (x > y) -> (x & ~y) + return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1); + case ISD::SETULT: + case 
ISD::SETLT: + // (x < y) -> (~x & y) + return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1); + case ISD::SETULE: + case ISD::SETLE: + // (x <= y) -> (~x | y) + return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1); + case ISD::SETUGE: + case ISD::SETGE: + // (x >=y) -> (x | ~y) + return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1); + } +} + +static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 && + Op.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Cannot set masked compare for this operation"); + + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + unsigned Opc = 0; + bool Unsigned = false; + bool Swap = false; + unsigned SSECC; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETNE: SSECC = 4; break; + case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; + case ISD::SETUGT: SSECC = 6; Unsigned = true; break; + case ISD::SETLT: Swap = true; //fall-through + case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; + case ISD::SETULT: SSECC = 1; Unsigned = true; break; + case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT + case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap + case ISD::SETULE: Unsigned = true; //fall-through + case ISD::SETLE: SSECC = 2; break; + } + + if (Swap) + std::swap(Op0, Op1); + if (Opc) + return DAG.getNode(Opc, dl, VT, Op0, Op1); + Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, dl, MVT::i8)); +} + +/// \brief Try to turn a VSETULT into a VSETULE by modifying its second +/// operand \p Op1. If non-trivial (for example because it's not constant) +/// return an empty value. +static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) +{ + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode()); + if (!BV) + return SDValue(); + + MVT VT = Op1.getSimpleValueType(); + MVT EVT = VT.getVectorElementType(); + unsigned n = VT.getVectorNumElements(); + SmallVector<SDValue, 8> ULTOp1; + + for (unsigned i = 0; i < n; ++i) { + ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); + if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) + return SDValue(); + + // Avoid underflow. + APInt Val = Elt->getAPIntValue(); + if (Val == 0) + return SDValue(); + + ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT)); + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); +} + +static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); + SDLoc dl(Op); + + if (isFP) { +#ifndef NDEBUG + MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); + assert(EltVT == MVT::f32 || EltVT == MVT::f64); +#endif + + unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); + unsigned Opc = X86ISD::CMPP; + if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { + assert(VT.getVectorNumElements() <= 16); + Opc = X86ISD::CMPM; + } + // In the two special cases we can't handle, emit two comparisons. 
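+ // Sketch of the two cases handled below: SETUEQ (unordered-or-equal) is
+ // emitted as CMPP(UNORD = 3) OR CMPP(EQ = 0), and SETONE
+ // (ordered-and-not-equal) as CMPP(ORD = 7) AND CMPP(NEQ = 4).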
+ if (SSECC == 8) { + unsigned CC0, CC1; + unsigned CombineOpc; + if (SetCCOpcode == ISD::SETUEQ) { + CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; + } else { + assert(SetCCOpcode == ISD::SETONE); + CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; + } + + SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CC0, dl, MVT::i8)); + SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CC1, dl, MVT::i8)); + return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); + } + // Handle all other FP comparisons here. + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, dl, MVT::i8)); + } + + MVT VTOp0 = Op0.getSimpleValueType(); + assert(VTOp0 == Op1.getSimpleValueType() && + "Expected operands with same type!"); + assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && + "Invalid number of packed elements for source and destination!"); + + if (VT.is128BitVector() && VTOp0.is256BitVector()) { + // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type + // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the + // legalizer firstly checks if the first operand in input to the setcc has + // a legal type. If so, then it promotes the return type to that same type. + // Otherwise, the return type is promoted to the 'next legal type' which, + // for a vector of MVT::i1 is always a 128-bit integer vector type. + // + // We reach this code only if the following two conditions are met: + // 1. Both return type and operand type have been promoted to wider types + // by the type legalizer. + // 2. The original operand type has been promoted to a 256-bit vector. + // + // Note that condition 2. only applies for AVX targets. + SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode); + return DAG.getZExtOrTrunc(NewOp, dl, VT); + } + + // The non-AVX512 code below works under the assumption that source and + // destination types are the same. + assert((Subtarget->hasAVX512() || (VT == VTOp0)) && + "Value types for source and destination must be the same!"); + + // Break 256-bit integer vector compare into smaller ones. + if (VT.is256BitVector() && !Subtarget->hasInt256()) + return Lower256IntVSETCC(Op, DAG); + + MVT OpVT = Op1.getSimpleValueType(); + if (OpVT.getVectorElementType() == MVT::i1) + return LowerBoolVSETCC_AVX512(Op, DAG); + + bool MaskResult = (VT.getVectorElementType() == MVT::i1); + if (Subtarget->hasAVX512()) { + if (Op1.getSimpleValueType().is512BitVector() || + (Subtarget->hasBWI() && Subtarget->hasVLX()) || + (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) + return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); + + // In AVX-512 architecture setcc returns mask with i1 elements, + // But there is no compare instruction for i8 and i16 elements in KNL. + // We are not talking about 512-bit operands in this case, these + // types are illegal. + if (MaskResult && + (OpVT.getVectorElementType().getSizeInBits() < 32 && + OpVT.getVectorElementType().getSizeInBits() >= 8)) + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); + } + + // Lower using XOP integer comparisons. + if ((VT == MVT::v16i8 || VT == MVT::v8i16 || + VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) { + // Translate compare code to XOP PCOM compare mode. 
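+ // For example, a signed (setgt a, b) maps to compare mode 0x02 (GT), while
+ // the unsigned predicates select the VPCOMU form of the instruction below.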
+ unsigned CmpMode = 0; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETULT: + case ISD::SETLT: CmpMode = 0x00; break; + case ISD::SETULE: + case ISD::SETLE: CmpMode = 0x01; break; + case ISD::SETUGT: + case ISD::SETGT: CmpMode = 0x02; break; + case ISD::SETUGE: + case ISD::SETGE: CmpMode = 0x03; break; + case ISD::SETEQ: CmpMode = 0x04; break; + case ISD::SETNE: CmpMode = 0x05; break; + } + + // Are we comparing unsigned or signed integers? + unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) + ? X86ISD::VPCOMU : X86ISD::VPCOM; + + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CmpMode, dl, MVT::i8)); + } + + // We are handling one of the integer comparisons here. Since SSE only has + // GT and EQ comparisons for integer, swapping operands and multiple + // operations may be required for some comparisons. + unsigned Opc; + bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; + bool Subus = false; + + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETNE: Invert = true; + case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; + case ISD::SETLT: Swap = true; + case ISD::SETGT: Opc = X86ISD::PCMPGT; break; + case ISD::SETGE: Swap = true; + case ISD::SETLE: Opc = X86ISD::PCMPGT; + Invert = true; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: Opc = X86ISD::PCMPGT; + FlipSigns = true; break; + case ISD::SETUGE: Swap = true; + case ISD::SETULE: Opc = X86ISD::PCMPGT; + FlipSigns = true; Invert = true; break; + } + + // Special case: Use min/max operations for SETULE/SETUGE + MVT VET = VT.getVectorElementType(); + bool hasMinMax = + (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) + || (Subtarget->hasSSE2() && (VET == MVT::i8)); + + if (hasMinMax) { + switch (SetCCOpcode) { + default: break; + case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; + case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; + } + + if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } + } + + bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); + if (!MinMax && hasSubus) { + // As another special case, use PSUBUS[BW] when it's profitable. E.g. for + // Op0 u<= Op1: + // t = psubus Op0, Op1 + // pcmpeq t, <0..0> + switch (SetCCOpcode) { + default: break; + case ISD::SETULT: { + // If the comparison is against a constant we can turn this into a + // setule. With psubus, setule does not require a swap. This is + // beneficial because the constant in the register is no longer + // destructed as the destination so it can be hoisted out of a loop. + // Only do this pre-AVX since vpcmp* is no longer destructive. + if (Subtarget->hasAVX()) + break; + SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); + if (ULEOp1.getNode()) { + Op1 = ULEOp1; + Subus = true; Invert = false; Swap = false; + } + break; + } + // Psubus is better than flip-sign because it requires no inversion. + case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; + case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; + } + + if (Subus) { + Opc = X86ISD::SUBUS; + FlipSigns = false; + } + } + + if (Swap) + std::swap(Op0, Op1); + + // Check that the operation in question is available (most are plain SSE2, + // but PCMPGTQ and PCMPEQQ have different requirements). 
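+ // When PCMPGTQ is unavailable, the 64-bit signed compare below is emulated
+ // with 32-bit operations using, roughly,
+ //   a > b  <=>  (aHi > bHi) | ((aHi == bHi) & (aLo >u bLo)),
+ // where the low halves (and, for an originally unsigned compare, the high
+ // halves too) have their sign bits flipped first so that PCMPGTD orders
+ // them correctly.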
+ if (VT == MVT::v2i64) { + if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { + assert(Subtarget->hasSSE2() && "Don't know how to lower!"); + + // First cast everything to the right type. + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. The lower + // compare is always unsigned. + SDValue SB; + if (FlipSigns) { + SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32); + } else { + SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32); + SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32); + SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Sign, Zero, Sign, Zero); + } + Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); + + // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) + SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); + SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); + + // Create masks for only the low parts/high parts of the 64 bit integers. + static const int MaskHi[] = { 1, 1, 3, 3 }; + static const int MaskLo[] = { 0, 0, 2, 2 }; + SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); + SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); + SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); + + SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); + Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); + + if (Invert) + Result = DAG.getNOT(dl, Result, MVT::v4i32); + + return DAG.getBitcast(VT, Result); + } + + if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { + // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with + // pcmpeqd + pshufd + pand. + assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); + + // First cast everything to the right type. + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); + + // Do the compare. + SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); + + // Make sure the lower and upper halves are both all-ones. + static const int Mask[] = { 1, 0, 3, 2 }; + SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); + Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); + + if (Invert) + Result = DAG.getNOT(dl, Result, MVT::v4i32); + + return DAG.getBitcast(VT, Result); + } + } + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. + if (FlipSigns) { + MVT EltVT = VT.getVectorElementType(); + SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, + VT); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); + } + + SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + + // If the logical-not of the result is required, perform that now. 
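+ // (If a min/max or PSUBUS form was chosen above, the equality checks below
+ // complete it: x <=u y  <=>  umin(x, y) == x, analogously umax for >=u, and
+ // x <=u y  <=>  psubus(x, y) == 0.)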
+ if (Invert) + Result = DAG.getNOT(dl, Result, VT); + + if (MinMax) + Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); + + if (Subus) + Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, + getZeroVector(VT, Subtarget, DAG, dl)); + + return Result; +} + +SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + MVT VT = Op.getSimpleValueType(); + + if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); + + assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) + && "SetCC type must be 8-bit or 1-bit integer"); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDLoc dl(Op); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + // Optimize to BT if possible. + // Lower (X & (1 << N)) == 0 to BT(X, N). + // Lower ((X >>u N) & 1) != 0 to BT(X, N). + // Lower ((X >>s N) & 1) != 0 to BT(X, N). + if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && + isNullConstant(Op1) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); + return NewSetCC; + } + } + + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of + // these. + if ((isOneConstant(Op1) || isNullConstant(Op1)) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + // If the input is a setcc, then reuse the input setcc or use a new one with + // the inverted condition. + if (Op0.getOpcode() == X86ISD::SETCC) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); + if (!Invert) + return Op0; + + CCode = X86::GetOppositeBranchCondition(CCode); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, dl, MVT::i8), + Op0.getOperand(1)); + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); + return SetCC; + } + } + if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); + return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC); + } + + bool isFP = Op1.getSimpleValueType().isFloatingPoint(); + unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG); + if (X86CC == X86::COND_INVALID) + return SDValue(); + + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS); + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); + return SetCC; +} + +SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Carry = Op.getOperand(2); + SDValue Cond = Op.getOperand(3); + SDLoc DL(Op); + + assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); + + assert(Carry.getOpcode() != ISD::CARRY_FALSE); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); +} + +// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
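+// That is, these are nodes whose value (or extra flags result) is EFLAGS, so
+// a SETCC, CMOV or branch can consume it directly instead of emitting a fresh
+// CMP against zero.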
+static bool isX86LogicalCmp(SDValue Op) { + unsigned Opc = Op.getNode()->getOpcode(); + if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || + Opc == X86ISD::SAHF) + return true; + if (Op.getResNo() == 1 && + (Opc == X86ISD::ADD || + Opc == X86ISD::SUB || + Opc == X86ISD::ADC || + Opc == X86ISD::SBB || + Opc == X86ISD::SMUL || + Opc == X86ISD::UMUL || + Opc == X86ISD::INC || + Opc == X86ISD::DEC || + Opc == X86ISD::OR || + Opc == X86ISD::XOR || + Opc == X86ISD::AND)) + return true; + + if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) + return true; + + return false; +} + +static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { + if (V.getOpcode() != ISD::TRUNCATE) + return false; + + SDValue VOp0 = V.getOperand(0); + unsigned InBits = VOp0.getValueSizeInBits(); + unsigned Bits = V.getValueSizeInBits(); + return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); +} + +SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + bool addTest = true; + SDValue Cond = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + SDLoc DL(Op); + MVT VT = Op1.getSimpleValueType(); + SDValue CC; + + // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops + // are available or VBLENDV if AVX is available. + // Otherwise FP cmovs get lowered into a less efficient branch sequence later. + if (Cond.getOpcode() == ISD::SETCC && + ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || + (Subtarget->hasSSE1() && VT == MVT::f32)) && + VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { + SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); + int SSECC = translateX86FSETCC( + cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); + + if (SSECC != 8) { + if (Subtarget->hasAVX512()) { + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, + DAG.getConstant(SSECC, DL, MVT::i8)); + return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); + } + + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, + DAG.getConstant(SSECC, DL, MVT::i8)); + + // If we have AVX, we can use a variable vector select (VBLENDV) instead + // of 3 logic instructions for size savings and potentially speed. + // Unfortunately, there is no scalar form of VBLENDV. + + // If either operand is a constant, don't try this. We can expect to + // optimize away at least one of the logic instructions later in that + // case, so that sequence would be faster than a variable blend. + + // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + + if (Subtarget->hasAVX() && + !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) { + + // Convert to vectors, do a VSELECT, and convert back to scalar. + // All of the conversions should be optimized away. + + MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; + SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); + SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); + SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); + + MVT VCmpVT = VT == MVT::f32 ? 
MVT::v4i32 : MVT::v2i64; + VCmp = DAG.getBitcast(VCmpVT, VCmp); + + SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + VSel, DAG.getIntPtrConstant(0, DL)); + } + SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); + SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); + return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); + } + } + + if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { + SDValue Op1Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) + Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); + else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) + Op1Scalar = Op1.getOperand(0); + SDValue Op2Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) + Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); + else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) + Op2Scalar = Op2.getOperand(0); + if (Op1Scalar.getNode() && Op2Scalar.getNode()) { + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, + Op1Scalar.getValueType(), + Cond, Op1Scalar, Op2Scalar); + if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, newSelect); + SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, + DAG.getIntPtrConstant(0, DL)); + } + } + + if (VT == MVT::v4i1 || VT == MVT::v2i1) { + SDValue zeroConst = DAG.getIntPtrConstant(0, DL); + Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, + DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); + Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, + DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1, + Cond, Op1, Op2); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); + } + + if (Cond.getOpcode() == ISD::SETCC) { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } + + // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y + // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y + // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y + // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y + if (Cond.getOpcode() == X86ISD::SETCC && + Cond.getOperand(1).getOpcode() == X86ISD::CMP && + isNullConstant(Cond.getOperand(1).getOperand(1))) { + SDValue Cmp = Cond.getOperand(1); + + unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + + if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { + SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; + + SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases + // (select (x != 0), -1, 0) -> neg & sbb + // (select (x == 0), 0, -1) -> neg & sbb + if (isNullConstant(Y) && + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, + DAG.getConstant(0, DL, + CmpOp0.getValueType()), + CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } + + Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, + CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); + + SDValue Res = // Res = 0 or -1. 
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); + + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) + Res = DAG.getNOT(DL, Res, Res.getValueType()); + + if (!isNullConstant(Op2)) + Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); + return Res; + } + } + + // Look past (and (setcc_carry (cmp ...)), 1). + if (Cond.getOpcode() == ISD::AND && + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); + + // If condition flag is set by a X86ISD::CMP, then use it as the condition + // setting operand in place of the X86ISD::SETCC. + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + + SDValue Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + MVT VT = Op.getSimpleValueType(); + + bool IllegalFPCMov = false; + if (VT.isFloatingPoint() && !VT.isVector() && + !isScalarFPTypeInSSEReg(VT)) // FPStack? + IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); + + if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || + Opc == X86ISD::BT) { // FIXME + Cond = Cmp; + addTest = false; + } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, DL, MVT::i8); + addTest = false; + } + + if (addTest) { + // Look past the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // We know the result of AND is compared against zero. Try to match + // it to BT. + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { + CC = NewSetCC.getOperand(0); + Cond = NewSetCC.getOperand(1); + addTest = false; + } + } + } + + if (addTest) { + CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); + } + + // a < b ? -1 : 0 -> RES = ~setcc_carry + // a < b ? 0 : -1 -> RES = setcc_carry + // a >= b ? -1 : 0 -> RES = setcc_carry + // a >= b ? 
0 : -1 -> RES = ~setcc_carry + if (Cond.getOpcode() == X86ISD::SUB) { + Cond = ConvertCmpIfNecessary(Cond, DAG); + unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); + + if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && + (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (isNullConstant(Op1) || isNullConstant(Op2))) { + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), + Cond); + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) + return DAG.getNOT(DL, Res, Res.getValueType()); + return Res; + } + } + + // X86 doesn't have an i8 cmov. If both operands are the result of a truncate + // widen the cmov and push the truncate through. This avoids introducing a new + // branch during isel and doesn't add any extensions. + if (Op.getValueType() == MVT::i8 && + Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { + SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); + if (T1.getValueType() == T2.getValueType() && + // Blacklist CopyFromReg to avoid partial register stalls. + T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ + SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); + SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); + } + } + + // X86ISD::CMOV means set the result (which is operand 1) to the RHS if + // condition is true. + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); + SDValue Ops[] = { Op2, Op1, CC, Cond }; + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); +} + +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + MVT VTElt = VT.getVectorElementType(); + MVT InVTElt = InVT.getVectorElementType(); + SDLoc dl(Op); + + // SKX processor + if ((InVTElt == MVT::i1) && + (((Subtarget->hasBWI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasBWI() && VT.is512BitVector() && + VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasDQI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || + + ((Subtarget->hasDQI() && VT.is512BitVector() && + VTElt.getSizeInBits() >= 32)))) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + unsigned int NumElts = VT.getVectorNumElements(); + + if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) + return SDValue(); + + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { + if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) + return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + } + + assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); + MVT ExtVT = NumElts == 8 ? 
MVT::v8i64 : MVT::v16i32; + SDValue NegOne = + DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, + ExtVT); + SDValue Zero = + DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT); + + SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); + if (VT.is512BitVector()) + return V; + return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); +} + +static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue In = Op->getOperand(0); + MVT VT = Op->getSimpleValueType(0); + MVT InVT = In.getSimpleValueType(); + assert(VT.getSizeInBits() == InVT.getSizeInBits()); + + MVT InSVT = InVT.getVectorElementType(); + assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits()); + + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + return SDValue(); + if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) + return SDValue(); + + SDLoc dl(Op); + + // SSE41 targets can use the pmovsx* instructions directly. + if (Subtarget->hasSSE41()) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. + SDValue Curr = In; + MVT CurrVT = InVT; + + // As SRAI is only available on i16/i32 types, we expand only up to i32 + // and handle i64 separately. + while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { + Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); + MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); + CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); + Curr = DAG.getBitcast(CurrVT, Curr); + } + + SDValue SignExt = Curr; + if (CurrVT != InVT) { + unsigned SignExtShift = + CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits(); + SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(SignExtShift, dl, MVT::i8)); + } + + if (CurrVT == VT) + return SignExt; + + if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { + SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(31, dl, MVT::i8)); + SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); + return DAG.getBitcast(VT, Ext); + } + + return SDValue(); +} + +static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + SDLoc dl(Op); + + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) + return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); + + if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && + (VT != MVT::v8i32 || InVT != MVT::v8i16) && + (VT != MVT::v16i16 || InVT != MVT::v16i8)) + return SDValue(); + + if (Subtarget->hasInt256()) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + + unsigned NumElems = InVT.getVectorNumElements(); + SDValue Undef = DAG.getUNDEF(InVT); + + SmallVector<int,8> ShufMask1(NumElems, -1); + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask1[i] = i; + + SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]); + + SmallVector<int,8> ShufMask2(NumElems, -1); + for (unsigned i = 0; i != 
NumElems/2; ++i) + ShufMask2[i] = i + NumElems/2; + + SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); + + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); + OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); +} + +// Lower vector extended loads using a shuffle. If SSSE3 is not available we +// may emit an illegal shuffle but the expansion is still better than scalar +// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise +// we'll emit a shuffle and a arithmetic shift. +// FIXME: Is the expansion actually better than scalar code? It doesn't seem so. +// TODO: It is possible to support ZExt by zeroing the undef values during +// the shuffle phase or after the shuffle. +static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT RegVT = Op.getSimpleValueType(); + assert(RegVT.isVector() && "We only custom lower vector sext loads."); + assert(RegVT.isInteger() && + "We only custom lower integer vector sext loads."); + + // Nothing useful we can do without SSE2 shuffles. + assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); + + LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); + SDLoc dl(Ld); + EVT MemVT = Ld->getMemoryVT(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); + + ISD::LoadExtType Ext = Ld->getExtensionType(); + + assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) + && "Only anyext and sext are currently implemented."); + assert(MemVT != RegVT && "Cannot extend to the same type"); + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + + if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { + // The only way in which we have a legal 256-bit vector result but not the + // integer 256-bit operations needed to directly lower a sextload is if we + // have AVX1 but not AVX2. In that case, we can always emit a sextload to + // a 128-bit vector and a normal sign_extend to 256-bits that should get + // correctly legalized. We do this late to allow the canonical form of + // sextload to persist throughout the rest of the DAG combiner -- it wants + // to fold together any extensions it can, and so will fuse a sign_extend + // of an sextload into a sextload targeting a wider value. + SDValue Load; + if (MemSz == 128) { + // Just switch this to a normal load. + assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " + "it must be a legal 128-bit vector " + "type!"); + Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + } else { + assert(MemSz < 128 && + "Can't extend a type wider than 128 bits to a 256 bit vector!"); + // Do an sext load to a 128-bit vector type. We want to use the same + // number of elements, but elements half as wide. This will end up being + // recursively lowered by this routine, but will succeed as we definitely + // have all the necessary features if we're using AVX1. 
+ EVT HalfEltVT = + EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); + EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); + Load = + DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + } + + // Replace chain users with the new chain. + assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + + // Finally, do a normal sign-extend to the desired register. + return DAG.getSExtOrTrunc(Load, dl, RegVT); + } + + // All sizes must be a power of two. + assert(isPowerOf2_32(RegSz * MemSz * NumElems) && + "Non-power-of-two elements are not custom lowered!"); + + // Attempt to load the original value using scalar loads. + // Find the largest scalar type that divides the total loaded size. + MVT SclrLoadTy = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { + SclrLoadTy = Tp; + } + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. + unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && + "Can only lower sext loads with a single scalar load!"); + + unsigned loadRegZize = RegSz; + if (Ext == ISD::SEXTLOAD && RegSz >= 256) + loadRegZize = 128; + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. + EVT LoadUnitVecVT = EVT::getVectorVT( + *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); + + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + loadRegZize / MemVT.getScalarSizeInBits()); + + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); + + // We can't shuffle using an illegal type. + assert(TLI.isTypeLegal(WideVecVT) && + "We only lower types that form legal widened vector types"); + + SmallVector<SDValue, 8> Chains; + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + for (unsigned i = 0; i < NumLoads; ++i) { + // Perform a single load. + SDValue ScalarLoad = + DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. + if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i, dl)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + + // Bitcast the loaded value to a vector of the original element type, in + // the size of the target vector type. 
+ SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); + unsigned SizeRatio = RegSz / MemSz; + + if (Ext == ISD::SEXTLOAD) { + // If we have SSE4.1, we can directly emit a VSEXT node. + if (Subtarget->hasSSE41()) { + SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Sext; + } + + // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest + // lanes. + assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && + "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); + + SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; + } + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + + // Bitcast to the requested type. + Shuff = DAG.getBitcast(RegVT, Shuff); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; +} + +// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or +// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart +// from the AND / OR. +static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { + Opc = Op.getOpcode(); + if (Opc != ISD::OR && Opc != ISD::AND) + return false; + return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse() && + Op.getOperand(1).getOpcode() == X86ISD::SETCC && + Op.getOperand(1).hasOneUse()); +} + +// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and +// 1 and that the SETCC node has a single use. +static bool isXor1OfSetCC(SDValue Op) { + if (Op.getOpcode() != ISD::XOR) + return false; + if (isOneConstant(Op.getOperand(1))) + return Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse(); + return false; +} + +SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + bool addTest = true; + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Dest = Op.getOperand(2); + SDLoc dl(Op); + SDValue CC; + bool Inverted = false; + + if (Cond.getOpcode() == ISD::SETCC) { + // Check for setcc([su]{add,sub,mul}o == 0). + if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && + isNullConstant(Cond.getOperand(1)) && + Cond.getOperand(0).getResNo() == 1 && + (Cond.getOperand(0).getOpcode() == ISD::SADDO || + Cond.getOperand(0).getOpcode() == ISD::UADDO || + Cond.getOperand(0).getOpcode() == ISD::SSUBO || + Cond.getOperand(0).getOpcode() == ISD::USUBO || + Cond.getOperand(0).getOpcode() == ISD::SMULO || + Cond.getOperand(0).getOpcode() == ISD::UMULO)) { + Inverted = true; + Cond = Cond.getOperand(0); + } else { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } + } +#if 0 + // FIXME: LowerXALUO doesn't handle these!! + else if (Cond.getOpcode() == X86ISD::ADD || + Cond.getOpcode() == X86ISD::SUB || + Cond.getOpcode() == X86ISD::SMUL || + Cond.getOpcode() == X86ISD::UMUL) + Cond = LowerXALUO(Cond, DAG); +#endif + + // Look pass (and (setcc_carry (cmp ...)), 1). 
+ if (Cond.getOpcode() == ISD::AND && + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); + + // If condition flag is set by a X86ISD::CMP, then use it as the condition + // setting operand in place of the X86ISD::SETCC. + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + + SDValue Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? + if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { + Cond = Cmp; + addTest = false; + } else { + switch (cast<ConstantSDNode>(CC)->getZExtValue()) { + default: break; + case X86::COND_O: + case X86::COND_B: + // These can only come from an arithmetic instruction with overflow, + // e.g. SADDO, UADDO. + Cond = Cond.getNode()->getOperand(1); + addTest = false; + break; + } + } + } + CondOpcode = Cond.getOpcode(); + if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + // Keep this in sync with LowerXALUO, otherwise we might create redundant + // instructions that can't be removed afterwards (i.e. X86ISD::ADD and + // X86ISD::INC). + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: + if (isOneConstant(RHS)) { + X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; + break; + } + X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: + if (isOneConstant(RHS)) { + X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; + break; + } + X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (Inverted) + X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, dl, MVT::i8); + addTest = false; + } else { + unsigned CondOpc; + if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { + SDValue Cmp = Cond.getOperand(0).getOperand(1); + if (CondOpc == ISD::OR) { + // Also, recognize the pattern generated by an FCMP_UNE. We can emit + // two branches instead of an explicit OR instruction with a + // separate test. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp)) { + CC = Cond.getOperand(0).getOperand(0); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = Cond.getOperand(1).getOperand(0); + Cond = Cmp; + addTest = false; + } + } else { // ISD::AND + // Also, recognize the pattern generated by an FCMP_OEQ. We can emit + // two branches instead of an explicit AND instruction with a + // separate test. 
However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp) && + Op.getNode()->hasOneUse()) { + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, dl, MVT::i8); + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, dl, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } + } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { + // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. + // It should be transformed during dag combiner except when the condition + // is set by a arithmetics with overflow node. + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, dl, MVT::i8); + Cond = Cond.getOperand(0).getOperand(1); + addTest = false; + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { + // For FCMP_OEQ, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); + CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { + // For FCMP_UNE, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_UNE. 
+ if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); + CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8); + Cond = Cmp; + addTest = false; + Dest = FalseBB; + } + } + } + } + + if (addTest) { + // Look pass the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // We know the result of AND is compared against zero. Try to match + // it to BT. + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { + CC = NewSetCC.getOperand(0); + Cond = NewSetCC.getOperand(1); + addTest = false; + } + } + } + + if (addTest) { + X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; + CC = DAG.getConstant(X86Cond, dl, MVT::i8); + Cond = EmitTest(Cond, X86Cond, dl, DAG); + } + Cond = ConvertCmpIfNecessary(Cond, DAG); + return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cond); +} + +// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. +// Calls to _alloca are needed to probe the stack when allocating more than 4k +// bytes in one go. Touching the stack at 4K increments is necessary to ensure +// that the guard pages used by the OS virtual memory manager are allocated in +// correct sequence. +SDValue +X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + bool SplitStack = MF.shouldSplitStack(); + bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || + SplitStack; + SDLoc dl(Op); + + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. 
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + + bool Is64Bit = Subtarget->is64Bit(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); + + SDValue Result; + if (!Lower) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); + assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" + " not tell us which reg is the stack pointer!"); + EVT VT = Node->getValueType(0); + SDValue Tmp3 = Node->getOperand(2); + + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + unsigned StackAlign = TFI.getStackAlignment(); + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + if (Align > StackAlign) + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + if (Is64Bit) { + // The 64 bit implementation of segmented stacks needs to clobber both r10 + // r11. This makes it impossible to use it along with nested parameters. + const Function *F = MF.getFunction(); + + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) + if (I->hasNestAttr()) + report_fatal_error("Cannot use segmented stacks with functions that " + "have nested arguments."); + } + + const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); + unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); + Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + DAG.getRegister(Vreg, SPTy)); + } else { + SDValue Flag; + const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); + + Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); + Flag = Chain.getValue(1); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); + + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned SPReg = RegInfo->getStackRegister(); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); + Chain = SP.getValue(1); + + if (Align) { + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); + } + + Result = SP; + } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); +} + +SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + auto PtrVT = getPointerTy(MF.getDataLayout()); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + SDLoc DL(Op); + + if (!Subtarget->is64Bit() || + Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. 
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); + } + + // __va_list_tag: + // gp_offset (0 - 6 * 8) + // fp_offset (48 - 48 + 8 * 16) + // overflow_arg_area (point to parameters coming in memory). + // reg_save_area + SmallVector<SDValue, 8> MemOps; + SDValue FIN = Op.getOperand(1); + // Store gp_offset + SDValue Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsGPOffset(), + DL, MVT::i32), + FIN, MachinePointerInfo(SV), false, false, 0); + MemOps.push_back(Store); + + // Store fp_offset + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); + Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, + MVT::i32), + FIN, MachinePointerInfo(SV, 4), false, false, 0); + MemOps.push_back(Store); + + // Store ptr to overflow_arg_area + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); + SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, + MachinePointerInfo(SV, 8), + false, false, 0); + MemOps.push_back(Store); + + // Store ptr to reg_save_area. + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( + Subtarget->isTarget64BitLP64() ? 8 : 4, DL)); + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); + Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( + SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0); + MemOps.push_back(Store); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); +} + +SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->is64Bit() && + "LowerVAARG only handles 64-bit va_arg!"); + assert(Op.getNode()->getNumOperands() == 4); + + MachineFunction &MF = DAG.getMachineFunction(); + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + // The Win64 ABI uses char* instead of a structure. + return DAG.expandVAArg(Op.getNode()); + + SDValue Chain = Op.getOperand(0); + SDValue SrcPtr = Op.getOperand(1); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + unsigned Align = Op.getConstantOperandVal(3); + SDLoc dl(Op); + + EVT ArgVT = Op.getNode()->getValueType(0); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); + uint8_t ArgMode; + + // Decide which area this value should be read from. + // TODO: Implement the AMD64 ABI in its entirety. This simple + // selection mechanism works only for the basic types. + if (ArgVT == MVT::f80) { + llvm_unreachable("va_arg for f80 not yet implemented"); + } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { + ArgMode = 2; // Argument passed in XMM register. Use fp_offset. + } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { + ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. + } else { + llvm_unreachable("Unhandled argument type in LowerVAARG"); + } + + if (ArgMode == 2) { + // Sanity Check: Make sure using fp_offset makes sense. 
+ assert(!Subtarget->useSoftFloat() && + !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && + Subtarget->hasSSE1()); + } + + // Insert VAARG_64 node into the DAG + // VAARG_64 returns two values: Variable Argument Address, Chain + SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), + DAG.getConstant(ArgMode, dl, MVT::i8), + DAG.getConstant(Align, dl, MVT::i32)}; + SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); + SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, + VTs, InstOps, MVT::i64, + MachinePointerInfo(SV), + /*Align=*/0, + /*Volatile=*/false, + /*ReadMem=*/true, + /*WriteMem=*/true); + Chain = VAARG.getValue(1); + + // Load the next argument and return it + return DAG.getLoad(ArgVT, dl, + Chain, + VAARG, + MachinePointerInfo(), + false, false, false, 0); +} + +static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, + // where a va_list is still an i8*. + assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); + if (Subtarget->isCallingConvWin64( + DAG.getMachineFunction().getFunction()->getCallingConv())) + // Probably a Win64 va_copy. + return DAG.expandVACopy(Op.getNode()); + + SDValue Chain = Op.getOperand(0); + SDValue DstPtr = Op.getOperand(1); + SDValue SrcPtr = Op.getOperand(2); + const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + SDLoc DL(Op); + + return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, + DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, + false, false, + MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); +} + +// getTargetVShiftByConstNode - Handle vector element shifts where the shift +// amount is a constant. Takes immediate version of shift as input. +static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, + SDValue SrcOp, uint64_t ShiftAmt, + SelectionDAG &DAG) { + MVT ElementType = VT.getVectorElementType(); + + // Fold this packed shift into its first operand if ShiftAmt is 0. + if (ShiftAmt == 0) + return SrcOp; + + // Check for ShiftAmt >= element width + if (ShiftAmt >= ElementType.getSizeInBits()) { + if (Opc == X86ISD::VSRAI) + ShiftAmt = ElementType.getSizeInBits() - 1; + else + return DAG.getConstant(0, dl, VT); + } + + assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) + && "Unknown target vector shift-by-constant node"); + + // Fold this packed vector shift into a build vector if SrcOp is a + // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. 
+ if (VT == SrcOp.getSimpleValueType() && + ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { + SmallVector<SDValue, 8> Elts; + unsigned NumElts = SrcOp->getNumOperands(); + ConstantSDNode *ND; + + switch(Opc) { + default: llvm_unreachable(nullptr); + case X86ISD::VSHLI: + for (unsigned i=0; i!=NumElts; ++i) { + SDValue CurrentOp = SrcOp->getOperand(i); + if (CurrentOp->getOpcode() == ISD::UNDEF) { + Elts.push_back(CurrentOp); + continue; + } + ND = cast<ConstantSDNode>(CurrentOp); + const APInt &C = ND->getAPIntValue(); + Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); + } + break; + case X86ISD::VSRLI: + for (unsigned i=0; i!=NumElts; ++i) { + SDValue CurrentOp = SrcOp->getOperand(i); + if (CurrentOp->getOpcode() == ISD::UNDEF) { + Elts.push_back(CurrentOp); + continue; + } + ND = cast<ConstantSDNode>(CurrentOp); + const APInt &C = ND->getAPIntValue(); + Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); + } + break; + case X86ISD::VSRAI: + for (unsigned i=0; i!=NumElts; ++i) { + SDValue CurrentOp = SrcOp->getOperand(i); + if (CurrentOp->getOpcode() == ISD::UNDEF) { + Elts.push_back(CurrentOp); + continue; + } + ND = cast<ConstantSDNode>(CurrentOp); + const APInt &C = ND->getAPIntValue(); + Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); + } + break; + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); + } + + return DAG.getNode(Opc, dl, VT, SrcOp, + DAG.getConstant(ShiftAmt, dl, MVT::i8)); +} + +// getTargetVShiftNode - Handle vector element shifts where the shift amount +// may or may not be a constant. Takes immediate version of shift as input. +static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, + SDValue SrcOp, SDValue ShAmt, + SelectionDAG &DAG) { + MVT SVT = ShAmt.getSimpleValueType(); + assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); + + // Catch shift-by-constant. + if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) + return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, + CShAmt->getZExtValue(), DAG); + + // Change opcode to non-immediate version + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; + case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; + case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; + } + + const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && + ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { + // Let the shuffle legalizer expand this shift amount node. + SDValue Op0 = ShAmt.getOperand(0); + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); + ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG); + } else { + // Need to build a vector containing shift amount. + // SSE/AVX packed shifts only use the lower 64-bit of the shift count. + SmallVector<SDValue, 4> ShOps; + ShOps.push_back(ShAmt); + if (SVT == MVT::i32) { + ShOps.push_back(DAG.getConstant(0, dl, SVT)); + ShOps.push_back(DAG.getUNDEF(SVT)); + } + ShOps.push_back(DAG.getUNDEF(SVT)); + + MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps); + } + + // The return type has to be a 128-bit type with the same element + // type as the input type. 
+ MVT EltVT = VT.getVectorElementType(); + MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + + ShAmt = DAG.getBitcast(ShVT, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); +} + +/// \brief Return Mask with the necessary casting or extending +/// for \p Mask according to \p MaskVT when lowering masking intrinsics +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } + + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); + + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + // MaskVT require < 64bit. Truncate mask (should succeed in any case), + // and bitcast. + MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } + + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } +} + +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting or extending for \p Mask when lowering masking intrinsics +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); + + if (isAllOnesConstant(Mask)) + return Op; + + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: + case X86ISD::VFPCLASSS: + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); +} + +/// \brief Creates an SDNode for a predicated scalar operation. +/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). +/// The mask is coming as MVT::i8 and it should be truncated +/// to MVT::i1 while lowering masking intrinsics. 
+/// The main difference between ScalarMaskingNode and VectorMaskingNode is using +/// "X86select" instead of "vselect". We just can't create the "vselect" node +/// for a scalar instruction. +static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (isAllOnesConstant(Mask)) + return Op; + + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + // The mask should be of type MVT::i1 + SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + + if (Op.getOpcode() == X86ISD::FSETCC) + return DAG.getNode(ISD::AND, dl, VT, Op, IMask); + if (Op.getOpcode() == X86ISD::VFPCLASS || + Op.getOpcode() == X86ISD::VFPCLASSS) + return DAG.getNode(ISD::OR, dl, VT, Op, IMask); + + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); +} + +static int getSEHRegistrationNodeSize(const Function *Fn) { + if (!Fn->hasPersonalityFn()) + report_fatal_error( + "querying registration node size for function without personality"); + // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See + // WinEHStatePass for the full struct definition. + switch (classifyEHPersonality(Fn->getPersonalityFn())) { + case EHPersonality::MSVC_X86SEH: return 24; + case EHPersonality::MSVC_CXX: return 16; + default: break; + } + report_fatal_error( + "can only recover FP for 32-bit MSVC EH personality functions"); +} + +/// When the MSVC runtime transfers control to us, either to an outlined +/// function or when returning to a parent frame after catching an exception, we +/// recover the parent frame pointer by doing arithmetic on the incoming EBP. +/// Here's the math: +/// RegNodeBase = EntryEBP - RegNodeSize +/// ParentFP = RegNodeBase - ParentFrameOffset +/// Subtracting RegNodeSize takes us to the offset of the registration node, and +/// subtracting the offset (negative on x86) takes us back to the parent FP. +static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, + SDValue EntryEBP) { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc dl; + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + + // It's possible that the parent function no longer has a personality function + // if the exceptional code was optimized away, in which case we just return + // the incoming EBP. + if (!Fn->hasPersonalityFn()) + return EntryEBP; + + // Get an MCSymbol that will ultimately resolve to the frame offset of the EH + // registration, or the .set_setframe offset. + MCSymbol *OffsetSym = + MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( + GlobalValue::getRealLinkageName(Fn->getName())); + SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); + SDValue ParentFrameOffset = + DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); + + // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after + // prologue to RBP in the parent function. 
+ const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + if (Subtarget.is64Bit()) + return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); + + int RegNodeSize = getSEHRegistrationNodeSize(Fn); + // RegNodeBase = EntryEBP - RegNodeSize + // ParentFP = RegNodeBase - ParentFrameOffset + SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, + DAG.getConstant(RegNodeSize, dl, PtrVT)); + return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); +} + +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + MVT VT = Op.getSimpleValueType(); + const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); + if (IntrData) { + switch(IntrData->Type) { + case INTR_TYPE_1OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); + case INTR_TYPE_2OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + case INTR_TYPE_2OP_IMM8: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); + case INTR_TYPE_3OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + case INTR_TYPE_4OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); + case INTR_TYPE_1OP_MASK_RM: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue RoundingMode; + // We allways add rounding mode to the Node. + // If the rounding mode is not specified, we add the + // "current direction" mode. + if (Op.getNumOperands() == 4) + RoundingMode = + DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + else + RoundingMode = Op.getOperand(4); + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) + if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), Src, RoundingMode), + Mask, PassThru, Subtarget, DAG); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, + RoundingMode), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_1OP_MASK: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + // We add rounding mode to the Node when + // - RM Opcode is specified and + // - RM is not "current direction". 
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(4); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue passThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), + Mask, passThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src0 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // There are 2 kinds of intrinsics in this group: + // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands + // (2) With rounding mode and sae - 7 operands. + if (Op.getNumOperands() == 6) { + SDValue Sae = Op.getOperand(5); + unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0; + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, + Sae), + Mask, Src0, Subtarget, DAG); + } + assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); + SDValue RoundingMode = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, + RoundingMode, Sae), + Mask, Src0, Subtarget, DAG); + } + case INTR_TYPE_2OP_MASK: + case INTR_TYPE_2OP_IMM8_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + + if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) + Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); + + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + // TODO: Intrinsics should have fast-math-flags to propagate. + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_2OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. + // First, we check if the intrinsic have rounding mode (6 operands), + // if not, we set rounding mode to "current". 
+ SDValue Rnd; + if (Op.getNumOperands() == 6) + Rnd = Op.getOperand(5); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_SCALAR_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + Src2, Src3, Sae), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Imm = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. + // First, we check if the intrinsic have rounding mode (7 operands), + // if not, we set rounding mode to "current". + SDValue Rnd; + if (Op.getNumOperands() == 7) + Rnd = Op.getOperand(6); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Imm, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_IMM8_MASK: + case INTR_TYPE_3OP_MASK: + case INSERT_SUBVEC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + + if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) + Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); + else if (IntrData->Type == INSERT_SUBVEC) { + // imm should be adapted to ISD::INSERT_SUBVECTOR behavior + assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!"); + unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue(); + Imm *= Src2.getSimpleValueType().getVectorNumElements(); + Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32); + } + + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(6); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Src3, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3), + Mask, PassThru, Subtarget, DAG); + } + case VPERM_3OP_MASKZ: + case VPERM_3OP_MASK:{ + // Src2 is the PassThru + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == VPERM_3OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else + PassThru = DAG.getBitcast(VT, Src2); + + // Swap Src1 and Src2 in the node creation + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src2, Src1, Src3), + Mask, PassThru, Subtarget, DAG); + } + case FMA_OP_MASK3: + case FMA_OP_MASKZ: + case FMA_OP_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == FMA_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else if (IntrData->Type == FMA_OP_MASK3) + PassThru = Src3; + else + PassThru = Src1; + + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Src3, Rnd), + Mask, PassThru, Subtarget, DAG); + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src1, Src2, Src3), + Mask, PassThru, Subtarget, DAG); + } + case TERLOG_OP_MASK: + case TERLOG_OP_MASKZ: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); + SDValue Mask = Op.getOperand(5); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = Src1; + // Set PassThru element. 
+ if (IntrData->Type == TERLOG_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3, Src4), + Mask, PassThru, Subtarget, DAG); + } + case FPCLASS: { + // FPclass intrinsics with mask + SDValue Src1 = Op.getOperand(1); + MVT VT = Src1.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); + SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), FPclassMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case FPCLASSS: { + SDValue Src1 = Op.getOperand(1); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); + } + case CMP_MASK: + case CMP_MASK_CC: { + // Comparison intrinsics with masks. + // Example of transformation: + // (i8 (int_x86_avx512_mask_pcmpeq_q_128 + // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> + // (i8 (bitcast + // (v8i1 (insert_subvector undef, + // (v2i1 (and (PCMPEQM %a, %b), + // (extract_subvector + // (v8i1 (bitcast %mask)), 0))), 0)))) + MVT VT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + SDValue Cmp; + if (IntrData->Type == CMP_MASK_CC) { + SDValue CC = Op.getOperand(3); + CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
+ if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), CC, Rnd); + } + //default rounding mode + if(!Cmp.getNode()) + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), CC); + + } else { + assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2)); + } + SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, dl, + MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CmpMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CMP_MASK_SCALAR_CC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); + SDValue Mask = Op.getOperand(4); + + SDValue Cmp; + if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + } + //default rounding mode + if(!Cmp.getNode()) + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + + SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, dl, + MVT::i1), + Subtarget, DAG); + + return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8, + DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask), + DAG.getValueType(MVT::i1)); + } + case COMI: { // Comparison intrinsics + ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG); + assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); + SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + case COMI_RM: { // Comparison intrinsics with Sae + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDValue CC = Op.getOperand(3); + SDValue Sae = Op.getOperand(4); + auto ComiType = TranslateX86ConstCondToX86CC(CC); + // choose between ordered and unordered (comi/ucomi) + unsigned comiOp = std::get<0>(ComiType) ? 
IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast<ConstantSDNode>(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + case VSHIFT: + return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + case VSHIFT_MASK: + return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, + Op.getSimpleValueType(), + Op.getOperand(1), + Op.getOperand(2), DAG), + Op.getOperand(4), Op.getOperand(3), Subtarget, + DAG); + case COMPRESS_EXPAND_IN_REG: { + SDValue Mask = Op.getOperand(3); + SDValue DataToCompress = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + if (isAllOnesConstant(Mask)) // return data as is + return Op.getOperand(1); + + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + DataToCompress), + Mask, PassThru, Subtarget, DAG); + } + case BROADCASTM: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + Mask = DAG.getBitcast(MaskVT, Mask); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); + } + case BLEND: { + SDValue Mask = Op.getOperand(3); + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), + Op.getOperand(2)); + } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. 
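+    // (Per the ISA, kunpckbw/wd/dq place the second source in the low half of
+    // the result and the first source in the high half, which is what the
+    // swapped operand order below models - illustrative note.)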
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } + case CONVERT_TO_MASK: { + MVT SrcVT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, + Op.getOperand(1)); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CvtMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CONVERT_MASK_TO_VEC: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask); + } + case BRCST_SUBVEC_TO_VEC: { + SDValue Src = Op.getOperand(1); + SDValue Passthru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + EVT resVT = Passthru.getValueType(); + SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, + DAG.getUNDEF(resVT), Src, + DAG.getIntPtrConstant(0, dl)); + SDValue immVal; + if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) + immVal = DAG.getConstant(0x44, dl, MVT::i8); + else + immVal = DAG.getConstant(0, dl, MVT::i8); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + subVec, subVec, immVal), + Mask, Passthru, Subtarget, DAG); + } + default: + break; + } + } + + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + // Operands intentionally swapped. Mask is last operand to intrinsic, + // but second operand for node/instruction. + return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(1)); + + // ptest and testp intrinsics. The intrinsic these come from are designed to + // return an integer value, not just an instruction so lower it to the ptest + // or testp pattern and a setcc for the result. 
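+  // For example (illustrative): ptest sets ZF = ((a & b) == 0) and
+  // CF = ((b & ~a) == 0), so _mm_testz_si128(a, b) becomes
+  //   X86ISD::PTEST a, b  ->  X86ISD::SETCC(COND_E)  ->  zero-extend to i32
+  // and the testc/testnzc variants simply pick COND_B / COND_A instead.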
+ case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestz_256: + case Intrinsic::x86_avx_ptestc_256: + case Intrinsic::x86_avx_ptestnzc_256: + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + case Intrinsic::x86_avx_vtestc_pd_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: { + bool IsTestPacked = false; + unsigned X86CC; + switch (IntNo) { + default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_avx_ptestz_256: + // ZF = 1 + X86CC = X86::COND_E; + break; + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestc_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_avx_ptestc_256: + // CF = 1 + X86CC = X86::COND_B; + break; + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestnzc_256: + // ZF and CF = 0 + X86CC = X86::COND_A; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; + SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); + SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + case Intrinsic::x86_avx512_kortestz_w: + case Intrinsic::x86_avx512_kortestc_w: { + unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B; + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); + SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); + SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + + case Intrinsic::x86_sse42_pcmpistria128: + case Intrinsic::x86_sse42_pcmpestria128: + case Intrinsic::x86_sse42_pcmpistric128: + case Intrinsic::x86_sse42_pcmpestric128: + case Intrinsic::x86_sse42_pcmpistrio128: + case Intrinsic::x86_sse42_pcmpestrio128: + case Intrinsic::x86_sse42_pcmpistris128: + case Intrinsic::x86_sse42_pcmpestris128: + case Intrinsic::x86_sse42_pcmpistriz128: + case Intrinsic::x86_sse42_pcmpestriz128: { + unsigned Opcode; + unsigned X86CC; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_sse42_pcmpistria128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_A; + break; + case Intrinsic::x86_sse42_pcmpestria128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_A; + break; + case Intrinsic::x86_sse42_pcmpistric128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_B; + break; + case Intrinsic::x86_sse42_pcmpestric128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_B; + break; + case Intrinsic::x86_sse42_pcmpistrio128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_O; + break; + case Intrinsic::x86_sse42_pcmpestrio128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_O; + break; + case Intrinsic::x86_sse42_pcmpistris128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_S; + break; + case Intrinsic::x86_sse42_pcmpestris128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_S; + break; + case Intrinsic::x86_sse42_pcmpistriz128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_E; + break; + case Intrinsic::x86_sse42_pcmpestriz128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_E; + break; + } + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, dl, MVT::i8), + SDValue(PCMP.getNode(), 1)); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + + case Intrinsic::x86_sse42_pcmpistri128: + case Intrinsic::x86_sse42_pcmpestri128: { + unsigned Opcode; + if (IntNo == Intrinsic::x86_sse42_pcmpistri128) + Opcode = X86ISD::PCMPISTRI; + else + Opcode = X86ISD::PCMPESTRI; + + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(Opcode, dl, VTs, NewOps); + } + + case Intrinsic::x86_seh_lsda: { + // Compute the symbol for the LSDA. We know it'll get emitted later. + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Op1 = Op.getOperand(1); + auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); + MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( + GlobalValue::getRealLinkageName(Fn->getName())); + + // Generate a simple absolute symbol reference. This intrinsic is only + // supported on 32-bit Windows, which isn't PIC. + SDValue Result = DAG.getMCSymbol(LSDASym, VT); + return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); + } + + case Intrinsic::x86_seh_recoverfp: { + SDValue FnOp = Op.getOperand(1); + SDValue IncomingFPOp = Op.getOperand(2); + GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); + auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); + if (!Fn) + report_fatal_error( + "llvm.x86.seh.recoverfp must take a function as the first argument"); + return recoverFramePointer(DAG, Fn, IncomingFPOp); + } + + case Intrinsic::localaddress: { + // Returns one of the stack, base, or frame pointer registers, depending on + // which is used to reference local variables. + MachineFunction &MF = DAG.getMachineFunction(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned Reg; + if (RegInfo->hasBasePointer(MF)) + Reg = RegInfo->getBaseRegister(); + else // This function handles the SP or FP case. 
+ Reg = RegInfo->getPtrSizedFrameRegister(MF); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); + } + } +} + +static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget * Subtarget) { + SDLoc dl(Op); + auto *C = cast<ConstantSDNode>(ScaleOp); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getSimpleValueType().getVectorNumElements()); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); + else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } + SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + if (Src.getOpcode() == ISD::UNDEF) + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); + SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; + return DAG.getMergeValues(RetOps, dl); +} + +static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain) { + SDLoc dl(Op); + auto *C = cast<ConstantSDNode>(ScaleOp); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getSimpleValueType().getVectorNumElements()); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); + else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. 
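+    // Illustrative example: an i8 mask guarding a v2i64 scatter is first
+    // bitcast to v8i1 and only its low two lanes are kept, i.e.
+    //   v2i1 = extract_subvector (v8i1 (bitcast i8 %mask)), 0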
+ MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + return SDValue(Res, 1); +} + +static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Mask, SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain) { + SDLoc dl(Op); + auto *C = cast<ConstantSDNode>(ScaleOp); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + MVT MaskVT = + MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); + else + MaskInReg = DAG.getBitcast(MaskVT, Mask); + //SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); + return SDValue(Res, 0); +} + +// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that +// read performance monitor counters (x86_rdpmc). +static void getReadPerformanceCounter(SDNode *N, SDLoc DL, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl<SDValue> &Results) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue LO, HI; + + // The ECX register is used to select the index of the performance counter + // to read. + SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, + N->getOperand(2)); + SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); + + // Reads the content of a 64-bit performance counter and returns it in the + // registers EDX:EAX. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + Chain = HI.getValue(1); + + if (Subtarget->is64Bit()) { + // The EAX register is loaded with the low-order 32 bits. The EDX register + // is loaded with the supported high-order bits of the counter. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, DL, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. + SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + +// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that +// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is +// also used to custom lower READCYCLECOUNTER nodes. 
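+// Roughly, the value assembled below is (illustrative C sketch; lo/hi stand
+// for the EAX/EDX copies made in this function):
+//   uint32_t lo, hi;
+//   uint64_t tsc = ((uint64_t)hi << 32) | lo;
+// On 64-bit targets this is the SHL/OR pair emitted below; on 32-bit targets
+// the two halves are combined with an ISD::BUILD_PAIR instead.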
+static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl<SDValue> &Results) { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); + SDValue LO, HI; + + // The processor's time-stamp counter (a 64-bit MSR) is stored into the + // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR + // and the EAX register is loaded with the low-order 32 bits. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + SDValue Chain = HI.getValue(1); + + if (Opcode == X86ISD::RDTSCP_DAG) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + + // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into + // the ECX register. Add 'ecx' explicitly to the chain. + SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, + HI.getValue(2)); + // Explicitly store the content of ECX at the location passed in input + // to the 'rdtscp' intrinsic. + Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), + MachinePointerInfo(), false, false, 0); + } + + if (Subtarget->is64Bit()) { + // The EDX register is loaded with the high-order 32 bits of the MSR, and + // the EAX register is loaded with the low-order 32 bits. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, DL, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. + SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SmallVector<SDValue, 2> Results; + SDLoc DL(Op); + getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + return DAG.getMergeValues(Results, DL); +} + +static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Chain = Op.getOperand(0); + SDValue RegNode = Op.getOperand(2); + WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); + if (!EHInfo) + report_fatal_error("EH registrations only live in functions using WinEH"); + + // Cast the operand to an alloca, and remember the frame index. + auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode); + if (!FINode) + report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); + EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); + + // Return the chain operand without making any DAG nodes. 
+ return Chain; +} + +/// \brief Lower intrinsics for TRUNCATE_TO_MEM case +/// return truncate Store/MaskedStore Node +static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, + SelectionDAG &DAG, + MVT ElementType) { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToTruncate = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + MVT VT = DataToTruncate.getSimpleValueType(); + MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements()); + + if (isAllOnesConstant(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, + MachinePointerInfo(), SVT, false, false, + SVT.getScalarSizeInBits()/8); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, SVT.getStoreSize(), + SVT.getScalarSizeInBits()/8); + + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, + VMask, SVT, MMO, true); +} + +static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + + const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); + if (!IntrData) { + if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) + return MarkEHRegistrationNode(Op, DAG); + if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || + IntNo == llvm::Intrinsic::x86_flags_read_u64 || + IntNo == llvm::Intrinsic::x86_flags_write_u32 || + IntNo == llvm::Intrinsic::x86_flags_write_u64) { + // We need a frame pointer because this will get lowered to a PUSH/POP + // sequence. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setHasOpaqueSPAdjustment(true); + // Don't do anything here, we will expand these intrinsics out later + // during ExpandISelPseudos in EmitInstrWithCustomInserter. + return SDValue(); + } + return SDValue(); + } + + SDLoc dl(Op); + switch(IntrData->Type) { + default: llvm_unreachable("Unknown Intrinsic Type"); + case RDSEED: + case RDRAND: { + // Emit the node with the right value type. + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); + SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); + + // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. + // Otherwise return the value from Rand, which is always 0, casted to i32. + SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), + DAG.getConstant(1, dl, Op->getValueType(1)), + DAG.getConstant(X86::COND_B, dl, MVT::i32), + SDValue(Result.getNode(), 1) }; + SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, + DAG.getVTList(Op->getValueType(1), MVT::Glue), + Ops); + + // Return { result, isValid, chain }. 
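+    // Illustrative C-level model of the { result, isValid } pair (using the
+    // real _rdrand32_step intrinsic as an analogy):
+    //   unsigned v; int ok = _rdrand32_step(&v);   // ok corresponds to CF
+    // isValid is 1 when CF was set, otherwise the (all-zero) raw result.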
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, + SDValue(Result.getNode(), 2)); + } + case GATHER: { + //gather(v1, mask, index, base, scale); + SDValue Chain = Op.getOperand(0); + SDValue Src = Op.getOperand(2); + SDValue Base = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, + Chain, Subtarget); + } + case SCATTER: { + //scatter(base, mask, index, v1, scale); + SDValue Chain = Op.getOperand(0); + SDValue Base = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Src = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, + Scale, Chain); + } + case PREFETCH: { + SDValue Hint = Op.getOperand(6); + unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); + assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); + unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); + SDValue Chain = Op.getOperand(0); + SDValue Mask = Op.getOperand(2); + SDValue Index = Op.getOperand(3); + SDValue Base = Op.getOperand(4); + SDValue Scale = Op.getOperand(5); + return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); + } + // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). + case RDTSC: { + SmallVector<SDValue, 2> Results; + getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, + Results); + return DAG.getMergeValues(Results, dl); + } + // Read Performance Monitoring Counters. + case RDPMC: { + SmallVector<SDValue, 2> Results; + getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); + } + // XTEST intrinsics. 
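+    // (_xtest() reports whether the thread is currently inside an RTM/HLE
+    // transaction; the lowering below is XTEST -> SETCC(COND_NE) ->
+    // zero-extend, i.e. "ZF clear" means "in a transaction" - illustrative.)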
+ case XTEST: { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_NE, dl, MVT::i8), + InTrans); + SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), + Ret, SDValue(InTrans.getNode(), 1)); + } + // ADC/ADCX/SBB + case ADX: { + SmallVector<SDValue, 2> Results; + SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); + SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), + DAG.getConstant(-1, dl, MVT::i8)); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), + Op.getOperand(4), GenCF.getValue(1)); + SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), + Op.getOperand(5), MachinePointerInfo(), + false, false, 0); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_B, dl, MVT::i8), + Res.getValue(1)); + Results.push_back(SetCC); + Results.push_back(Store); + return DAG.getMergeValues(Results, dl); + } + case COMPRESS_TO_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToCompress = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + MVT VT = DataToCompress.getSimpleValueType(); + if (isAllOnesConstant(Mask)) // return just a store + return DAG.getStore(Chain, dl, DataToCompress, Addr, + MachinePointerInfo(), false, false, + VT.getScalarSizeInBits()/8); + + SDValue Compressed = + getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), + Mask, DAG.getUNDEF(VT), Subtarget, DAG); + return DAG.getStore(Chain, dl, Compressed, Addr, + MachinePointerInfo(), false, false, + VT.getScalarSizeInBits()/8); + } + case TRUNCATE_TO_MEM_VI8: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); + case TRUNCATE_TO_MEM_VI16: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); + case TRUNCATE_TO_MEM_VI32: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); + case EXPAND_FROM_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue PassThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + if (isAllOnesConstant(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, + false, VT.getScalarSizeInBits()/8); + + SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), + false, false, false, + VT.getScalarSizeInBits()/8); + + SDValue Results[] = { + getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), + Mask, PassThru, Subtarget, DAG), Chain}; + return DAG.getMergeValues(Results, dl); + } + } +} + +SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + if (verifyReturnAddressArgumentIsConstant(Op, DAG)) + return SDValue(); + + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + + if (Depth > 0) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); + return 
DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, + FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); + } + + // Just load the return address. + SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + RetAddrFI, MachinePointerInfo(), false, false, false, 0); +} + +SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + EVT VT = Op.getValueType(); + + MFI->setFrameAddressIsTaken(true); + + if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { + // Depth > 0 makes no sense on targets which use Windows unwind codes. It + // is not possible to crawl up the stack without looking at the unwind codes + // simultaneously. + int FrameAddrIndex = FuncInfo->getFAIndex(); + if (!FrameAddrIndex) { + // Set up a frame object for the return address. + unsigned SlotSize = RegInfo->getSlotSize(); + FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject( + SlotSize, /*Offset=*/0, /*IsImmutable=*/false); + FuncInfo->setFAIndex(FrameAddrIndex); + } + return DAG.getFrameIndex(FrameAddrIndex, VT); + } + + unsigned FrameReg = + RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + SDLoc dl(Op); // FIXME probably not meaningful + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), + false, false, false, 0); + return FrameAddr; +} + +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. +unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + const MachineFunction &MF = DAG.getMachineFunction(); + + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("esp", X86::ESP) + .Case("rsp", X86::RSP) + .Case("ebp", X86::EBP) + .Case("rbp", X86::RBP) + .Default(0); + + if (Reg == X86::EBP || Reg == X86::RBP) { + if (!TFI.hasFP(MF)) + report_fatal_error("register " + StringRef(RegName) + + " is allocatable: function has no frame pointer"); +#ifndef NDEBUG + else { + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = + RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && + "Invalid Frame Register!"); + } +#endif + } + + if (Reg) + return Reg; + + report_fatal_error("Invalid register name global variable"); +} + +SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, + SelectionDAG &DAG) const { + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); +} + +unsigned X86TargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; + + return Subtarget->isTarget64BitLP64() ? 
X86::RAX : X86::EAX; +} + +unsigned X86TargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Funclet personalities don't use selectors (the runtime does the selection). + assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; +} + +SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Offset = Op.getOperand(1); + SDValue Handler = Op.getOperand(2); + SDLoc dl (Op); + + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || + (FrameReg == X86::EBP && PtrVT == MVT::i32)) && + "Invalid Frame Register!"); + SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); + unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; + + SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, + DAG.getIntPtrConstant(RegInfo->getSlotSize(), + dl)); + StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); + Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), + false, false, 0); + Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); + + return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, + DAG.getRegister(StoreAddrReg, PtrVT)); +} + +SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, + DAG.getVTList(MVT::i32, MVT::Other), + Op.getOperand(0), Op.getOperand(1)); +} + +SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, + Op.getOperand(0), Op.getOperand(1)); +} + +static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { + return Op.getOperand(0); +} + +SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, + SelectionDAG &DAG) const { + SDValue Root = Op.getOperand(0); + SDValue Trmp = Op.getOperand(1); // trampoline + SDValue FPtr = Op.getOperand(2); // nested function + SDValue Nest = Op.getOperand(3); // 'nest' parameter value + SDLoc dl (Op); + + const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + + if (Subtarget->is64Bit()) { + SDValue OutChains[6]; + + // Large code-model. + const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. + const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. + + const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; + const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; + + const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix + + // Load the pointer to the nested function into R11. + unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 + SDValue Addr = Trmp; + OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(2, dl, MVT::i64)); + OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, + MachinePointerInfo(TrmpAddr, 2), + false, false, 2); + + // Load the 'nest' parameter value into R10. 
+ // R10 is specified in X86CallingConv.td + OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(10, dl, MVT::i64)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr, 10), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(12, dl, MVT::i64)); + OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 12), + false, false, 2); + + // Jump to the nested function. + OpCode = (JMP64r << 8) | REX_WB; // jmpq *... + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(20, dl, MVT::i64)); + OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr, 20), + false, false, 0); + + unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(22, dl, MVT::i64)); + OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), + Addr, MachinePointerInfo(TrmpAddr, 22), + false, false, 0); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); + } else { + const Function *Func = + cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); + CallingConv::ID CC = Func->getCallingConv(); + unsigned NestReg; + + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::C: + case CallingConv::X86_StdCall: { + // Pass 'nest' parameter in ECX. + // Must be kept in sync with X86CallingConv.td + NestReg = X86::ECX; + + // Check that ECX wasn't needed by an 'inreg' parameter. + FunctionType *FTy = Func->getFunctionType(); + const AttributeSet &Attrs = Func->getAttributes(); + + if (!Attrs.isEmpty() && !Func->isVarArg()) { + unsigned InRegCount = 0; + unsigned Idx = 1; + + for (FunctionType::param_iterator I = FTy->param_begin(), + E = FTy->param_end(); I != E; ++I, ++Idx) + if (Attrs.hasAttribute(Idx, Attribute::InReg)) { + auto &DL = DAG.getDataLayout(); + // FIXME: should only count parameters that are lowered to integers. + InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; + } + + if (InRegCount > 2) { + report_fatal_error("Nest register in use - reduce number of inreg" + " parameters!"); + } + } + break; + } + case CallingConv::X86_FastCall: + case CallingConv::X86_ThisCall: + case CallingConv::Fast: + // Pass 'nest' parameter in EAX. + // Must be kept in sync with X86CallingConv.td + NestReg = X86::EAX; + break; + } + + SDValue OutChains[4]; + SDValue Addr, Disp; + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(10, dl, MVT::i32)); + Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); + + // This is storing the opcode for MOV32ri. + const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. + const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; + OutChains[0] = DAG.getStore(Root, dl, + DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8), + Trmp, MachinePointerInfo(TrmpAddr), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(1, dl, MVT::i32)); + OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 1), + false, false, 1); + + const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
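+    // Resulting 10-byte trampoline, by offset into Trmp (illustrative summary
+    // of the stores below):
+    //   +0        0xB8 + reg     mov $Nest, %ecx/%eax
+    //   +1..+4    Nest
+    //   +5        0xE9           jmp rel32
+    //   +6..+9    FPtr - (Trmp + 10)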
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(5, dl, MVT::i32)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), + Addr, MachinePointerInfo(TrmpAddr, 5), + false, false, 1); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(6, dl, MVT::i32)); + OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, + MachinePointerInfo(TrmpAddr, 6), + false, false, 1); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); + } +} + +SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { + /* + The rounding mode is in bits 11:10 of FPSR, and has the following + settings: + 00 Round to nearest + 01 Round to -inf + 10 Round to +inf + 11 Round to 0 + + FLT_ROUNDS, on the other hand, expects the following: + -1 Undefined + 0 Round to 0 + 1 Round to nearest + 2 Round to +inf + 3 Round to -inf + + To perform the conversion, we do: + (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) + */ + + MachineFunction &MF = DAG.getMachineFunction(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + unsigned StackAlignment = TFI.getStackAlignment(); + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + + // Save FP Control Word to stack slot + int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); + SDValue StackSlot = + DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, 2, 2); + + SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; + SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, + DAG.getVTList(MVT::Other), + Ops, MVT::i16, MMO); + + // Load FP Control Word from stack slot + SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, + MachinePointerInfo(), false, false, false, 0); + + // Transform as necessary + SDValue CWD1 = + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, + CWD, DAG.getConstant(0x800, DL, MVT::i16)), + DAG.getConstant(11, DL, MVT::i8)); + SDValue CWD2 = + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, + CWD, DAG.getConstant(0x400, DL, MVT::i16)), + DAG.getConstant(9, DL, MVT::i8)); + + SDValue RetVal = + DAG.getNode(ISD::AND, DL, MVT::i16, + DAG.getNode(ISD::ADD, DL, MVT::i16, + DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), + DAG.getConstant(1, DL, MVT::i16)), + DAG.getConstant(3, DL, MVT::i16)); + + return DAG.getNode((VT.getSizeInBits() < 16 ? + ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); +} + +/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. +// +// 1. i32/i64 128/256-bit vector (native support require VLX) are expended +// to 512-bit vector. +// 2. i8/i16 vector implemented using dword LZCNT vector instruction +// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, +// split the vector, perform operation on it's Lo a Hi part and +// concatenate the results. +static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (EltVT == MVT::i64 || EltVT == MVT::i32) { + // Extend to 512 bit vector. 
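+    // e.g. a v4i32 ctlz is widened as: insert the v4i32 into an undef v16i32,
+    // run the 512-bit CTLZ (vplzcntd) on it, then extract the low v4i32 of
+    // the result - illustrative of the sequence built below.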
+ assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unsupported value type for operation"); + + MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits()); + SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, + DAG.getUNDEF(NewVT), + Op.getOperand(0), + DAG.getIntPtrConstant(0, dl)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode, + DAG.getIntPtrConstant(0, dl)); + } + + assert((EltVT == MVT::i8 || EltVT == MVT::i16) && + "Unsupported element type"); + + if (16 < NumElems) { + // Split vector, it's Lo and Hi parts will be handled in next iteration. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); + MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); + + Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo); + Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + } + + MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); + + assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && + "Unsupported value type for operation"); + + // Use native supported vector instruction vplzcntd. + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); + SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); + SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); + + return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); +} + +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT OpVT = VT; + unsigned NumBits = VT.getSizeInBits(); + SDLoc dl(Op); + + if (VT.isVector() && Subtarget->hasAVX512()) + return LowerVectorCTLZ_AVX512(Op, DAG); + + Op = Op.getOperand(0); + if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. + OpVT = MVT::i32; + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); + } + + // Issue a bsr (scan bits in reverse) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // If src is zero (i.e. bsr sets ZF), returns NumBits. + SDValue Ops[] = { + Op, + DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), + DAG.getConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1) + }; + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); + + // Finally xor with NumBits-1. + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, + DAG.getConstant(NumBits - 1, dl, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + EVT OpVT = VT; + unsigned NumBits = VT.getSizeInBits(); + SDLoc dl(Op); + + Op = Op.getOperand(0); + if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. + OpVT = MVT::i32; + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); + } + + // Issue a bsr (scan bits in reverse). + SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // And xor with NumBits-1. 
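+  // (BSR yields the bit index i of the most significant set bit, and for a
+  // power-of-two NumBits the subtraction (NumBits - 1) - i never borrows, so
+  // it equals (NumBits - 1) ^ i; e.g. for i32 with the MSB at bit 28,
+  // 31 ^ 28 = 3 leading zeros.)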
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, + DAG.getConstant(NumBits - 1, dl, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + unsigned NumBits = VT.getScalarSizeInBits(); + SDLoc dl(Op); + + if (VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + SDValue N0 = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, VT); + + // lsb(x) = (x & -x) + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, + DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); + + // cttz_undef(x) = (width - 1) - ctlz(lsb) + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF && + TLI.isOperationLegal(ISD::CTLZ, VT)) { + SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, + DAG.getNode(ISD::CTLZ, dl, VT, LSB)); + } + + // cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getConstant(1, dl, VT); + return DAG.getNode(ISD::CTPOP, dl, VT, + DAG.getNode(ISD::SUB, dl, VT, LSB, One)); + } + + assert(Op.getOpcode() == ISD::CTTZ && + "Only scalar CTTZ requires custom lowering"); + + // Issue a bsf (scan bits forward) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); + + // If src is zero (i.e. bsf sets ZF), returns NumBits. + SDValue Ops[] = { + Op, + DAG.getConstant(NumBits, dl, VT), + DAG.getConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1) + }; + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); +} + +// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit +// ones, and then concatenate the result back. +static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + + assert(VT.is256BitVector() && VT.isInteger() && + "Unsupported value type for operation"); + + unsigned NumElems = VT.getVectorNumElements(); + SDLoc dl(Op); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); + + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); +} + +static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Op.getOperand(1)); + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Op.getOperand(1)); + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit 
vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + if (VT == MVT::i1) + return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); + + // Decompose 256-bit ops into smaller 128-bit ops. + if (VT.is256BitVector() && !Subtarget->hasInt256()) + return Lower256IntArith(Op, DAG); + + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + + // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector + // pairs, multiply and truncate. + if (VT == MVT::v16i8 || VT == MVT::v32i8) { + if (Subtarget->hasInt256()) { + if (VT == MVT::v32i8) { + MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2); + SDValue Lo = DAG.getIntPtrConstant(0, dl); + SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); + SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo); + SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo); + SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi); + SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo), + DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi)); + } + + MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); + return DAG.getNode( + ISD::TRUNCATE, dl, VT, + DAG.getNode(ISD::MUL, dl, ExVT, + DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A), + DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B))); + } + + assert(VT == MVT::v16i8 && + "Pre-AVX2 support only supports v16i8 multiplication"); + MVT ExVT = MVT::v8i16; + + // Extract the lo parts and sign extend to i16 + SDValue ALo, BLo; + if (Subtarget->hasSSE41()) { + ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); + BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); + } else { + const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, + -1, 4, -1, 5, -1, 6, -1, 7}; + ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + ALo = DAG.getBitcast(ExVT, ALo); + BLo = DAG.getBitcast(ExVT, BLo); + ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); + BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); + } + + // Extract the hi parts and sign extend to i16 + SDValue AHi, BHi; + if (Subtarget->hasSSE41()) { + const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1}; + AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); + BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); + } else { + const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, + -1, 12, -1, 13, -1, 14, -1, 15}; + AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + AHi = DAG.getBitcast(ExVT, AHi); + BHi = DAG.getBitcast(ExVT, BHi); + AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); + BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); + } + + // Multiply, mask the lower 8bits of the lo/hi results and pack + SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); + SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); + RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); + RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); + return 
DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); + } + + // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. + if (VT == MVT::v4i32) { + assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && + "Should not custom lower when pmuldq is available!"); + + // Extract the odd parts. + static const int UnpackMask[] = { 1, -1, 3, -1 }; + SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); + SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); + + // Multiply the even parts. + SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); + // Now multiply odd parts. + SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); + + Evens = DAG.getBitcast(VT, Evens); + Odds = DAG.getBitcast(VT, Odds); + + // Merge the two vectors back together with a shuffle. This expands into 2 + // shuffles. + static const int ShufMask[] = { 0, 4, 2, 6 }; + return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); + } + + assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && + "Only know how to lower V2I64/V4I64/V8I64 multiply"); + + // Ahi = psrlqi(a, 32); + // Bhi = psrlqi(b, 32); + // + // AloBlo = pmuludq(a, b); + // AloBhi = pmuludq(a, Bhi); + // AhiBlo = pmuludq(Ahi, b); + + // AloBhi = psllqi(AloBhi, 32); + // AhiBlo = psllqi(AhiBlo, 32); + // return AloBlo + AloBhi + AhiBlo; + + SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); + SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); + + SDValue AhiBlo = Ahi; + SDValue AloBhi = Bhi; + // Bit cast to 32-bit vectors for MULUDQ + MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : + (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; + A = DAG.getBitcast(MulVT, A); + B = DAG.getBitcast(MulVT, B); + Ahi = DAG.getBitcast(MulVT, Ahi); + Bhi = DAG.getBitcast(MulVT, Bhi); + + SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); + // After shifting right const values the result may be all-zero. 
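+  // (The decomposition above is 64-bit schoolbook multiplication mod 2^64:
+  // with a = Ahi*2^32 + Alo and b = Bhi*2^32 + Blo,
+  //   a*b mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
+  // since the Ahi*Bhi term is shifted out entirely - illustrative note.)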
+ if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) { + AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); + } + if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) { + AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); + } + + SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); + return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); +} + +SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetWin64() && "Unexpected target"); + EVT VT = Op.getValueType(); + assert(VT.isInteger() && VT.getSizeInBits() == 128 && + "Unexpected return type for lowering"); + + RTLIB::Libcall LC; + bool isSigned; + switch (Op->getOpcode()) { + default: llvm_unreachable("Unexpected request for libcall!"); + case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; + case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; + case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; + case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; + case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; + case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; + } + + SDLoc dl(Op); + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { + EVT ArgVT = Op->getOperand(i).getValueType(); + assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && + "Unexpected argument type for lowering"); + SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); + Entry.Node = StackPtr; + InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), + false, false, 16); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Ty = PointerType::get(ArgTy,0); + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + } + + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy(DAG.getDataLayout())); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(InChain) + .setCallee(getLibcallCallingConv(LC), + static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), + Callee, std::move(Args), 0) + .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + + std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); + return DAG.getBitcast(VT, CallInfo.first); +} + +static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); + MVT VT = Op0.getSimpleValueType(); + SDLoc dl(Op); + + assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || + (VT == MVT::v8i32 && Subtarget->hasInt256())); + + // PMULxD operations multiply each even value (starting at 0) of LHS with + // the related value of RHS and produce a widen result. + // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> + // => <2 x i64> <ae|cg> + // + // In other word, to have all the results, we need to perform two PMULxD: + // 1. one with the even values. + // 2. one with the odd values. + // To achieve #2, with need to place the odd values at an even position. 
+ // + // Place the odd value at an even position (basically, shift all values 1 + // step to the left): + const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; + // <a|b|c|d> => <b|undef|d|undef> + SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); + // <e|f|g|h> => <f|undef|h|undef> + SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + + // Emit two multiplies, one for the lower 2 ints and one for the higher 2 + // ints. + MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; + bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; + unsigned Opcode = + (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; + // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> + // => <2 x i64> <ae|cg> + SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef> + // => <2 x i64> <bf|dh> + SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); + + // Shuffle it back into the right order. + SDValue Highs, Lows; + if (VT == MVT::v8i32) { + const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; + Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; + Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + } else { + const int HighMask[] = {1, 5, 3, 7}; + Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 4, 2, 6}; + Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + } + + // If we have a signed multiply but no PMULDQ fix up the high parts of a + // unsigned multiply. + if (IsSigned && !Subtarget->hasSSE41()) { + SDValue ShAmt = DAG.getConstant( + 31, dl, + DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout())); + SDValue T1 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); + SDValue T2 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); + + SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); + Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); + } + + // The first result of MUL_LOHI is actually the low value, followed by the + // high value. + SDValue Ops[] = {Lows, Highs}; + return DAG.getMergeValues(Ops, dl); +} + +// Return true if the required (according to Opcode) shift-imm form is natively +// supported by the Subtarget +static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, + unsigned Opcode) { + if (VT.getScalarSizeInBits() < 16) + return false; + + if (VT.is512BitVector() && + (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI())) + return true; + + bool LShift = VT.is128BitVector() || + (VT.is256BitVector() && Subtarget->hasInt256()); + + bool AShift = LShift && (Subtarget->hasVLX() || + (VT != MVT::v2i64 && VT != MVT::v4i64)); + return (Opcode == ISD::SRA) ? AShift : LShift; +} + +// The shift amount is a variable, but it is the same for all vector lanes. +// These instructions are defined together with shift-immediate. 
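+// For example, PSLLW/PSLLD/PSLLQ with an XMM count operand take a single
+// count from the low 64 bits of that operand and apply it to every lane,
+// so legality here matches the shift-by-immediate case.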
+static +bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, + unsigned Opcode) { + return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); +} + +// Return true if the required (according to Opcode) variable-shift form is +// natively supported by the Subtarget +static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, + unsigned Opcode) { + + if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16) + return false; + + // vXi16 supported only on AVX-512, BWI + if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI()) + return false; + + if (VT.is512BitVector() || Subtarget->hasVLX()) + return true; + + bool LShift = VT.is128BitVector() || VT.is256BitVector(); + bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; + return (Opcode == ISD::SRA) ? AShift : LShift; +} + +static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : + (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; + + auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); + MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); + SDValue Ex = DAG.getBitcast(ExVT, R); + + if (ShiftAmt >= 32) { + // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. + SDValue Upper = + getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); + SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt - 32, DAG); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {9, 1, 11, 3, 13, 5, 15, 7}); + } else { + // SRA upper i32, SHL whole i64 and select lower i32. + SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt, DAG); + SDValue Lower = + getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); + Lower = DAG.getBitcast(ExVT, Lower); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {8, 1, 10, 3, 12, 5, 14, 7}); + } + return DAG.getBitcast(VT, Ex); + }; + + // Optimize shl/srl/sra with constant shift amount. + if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { + if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { + uint64_t ShiftAmt = ShiftConst->getZExtValue(); + + if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + // i64 SRA needs to be performed as partial shifts. 
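+ // SSE/AVX2 have no 64-bit arithmetic right shift, so ArithmeticShiftRight64
+ // above assembles one from 32-bit pieces: a VSRAI supplies the sign-filled
+ // upper halves and a 64-bit logical shift (or a second VSRAI for counts of
+ // 32 or more) supplies the rest, merged with a shuffle.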
+ if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) + return ArithmeticShiftRight64(ShiftAmt); + + if (VT == MVT::v16i8 || + (Subtarget->hasInt256() && VT == MVT::v32i8) || + VT == MVT::v64i8) { + unsigned NumElts = VT.getVectorNumElements(); + MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + + // Simple i8 add case + if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) + return DAG.getNode(ISD::ADD, dl, VT, R, R); + + // ashr(R, 7) === cmp_slt(R, 0) + if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // XOP can shift v16i8 directly instead of as shift v8i16 + mask. + if (VT == MVT::v16i8 && Subtarget->hasXOP()) + return SDValue(); + + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, + R, ShiftAmt, DAG); + SHL = DAG.getBitcast(VT, SHL); + // Zero out the rightmost bits. + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, + R, ShiftAmt, DAG); + SRL = DAG.getBitcast(VT, SRL); + // Zero out the leftmost bits. + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); + } + if (Op.getOpcode() == ISD::SRA) { + // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + + SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + llvm_unreachable("Unknown shift opcode."); + } + } + } + + // Special case in 32-bit mode, where i64 is expanded into high and low parts. + if (!Subtarget->is64Bit() && !Subtarget->hasXOP() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { + + // Peek through any splat that was introduced for i64 shift vectorization. + int SplatIndex = -1; + if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) + if (SVN->isSplat()) { + SplatIndex = SVN->getSplatIndex(); + Amt = Amt.getOperand(0); + assert(SplatIndex < (int)VT.getVectorNumElements() && + "Splat shuffle referencing second operand"); + } + + if (Amt.getOpcode() != ISD::BITCAST || + Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / + VT.getVectorNumElements(); + unsigned RatioInLog2 = Log2_32_Ceil(Ratio); + uint64_t ShiftAmt = 0; + unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); + for (unsigned i = 0; i != Ratio; ++i) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp)); + if (!C) + return SDValue(); + // 6 == Log2(64) + ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); + } + + // Check remaining shift amounts (if not a splat). 
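+ // (In 32-bit mode each 64-bit count was legalized into 32-bit pieces, so a
+ // full count is reassembled here from Ratio consecutive elements - shifts
+ // of 0 and 32 bits when Ratio is 2; the loop below then checks that every
+ // remaining group encodes the same count.)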
+ if (SplatIndex < 0) { + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); + if (!C) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) + return SDValue(); + } + } + + if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + if (Op.getOpcode() == ISD::SRA) + return ArithmeticShiftRight64(ShiftAmt); + } + + return SDValue(); +} + +static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : + (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; + + unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL : + (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA; + + if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { + SDValue BaseShAmt; + MVT EltVT = VT.getVectorElementType(); + + if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { + // Check if this build_vector node is doing a splat. + // If so, then set BaseShAmt equal to the splat value. + BaseShAmt = BV->getSplatValue(); + if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) + BaseShAmt = SDValue(); + } else { + if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) + Amt = Amt.getOperand(0); + + ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt); + if (SVN && SVN->isSplat()) { + unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); + SDValue InVec = Amt.getOperand(0); + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && + "Unexpected shuffle index found!"); + BaseShAmt = InVec.getOperand(SplatIdx); + } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { + if (ConstantSDNode *C = + dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { + if (C->getZExtValue() == SplatIdx) + BaseShAmt = InVec.getOperand(1); + } + } + + if (!BaseShAmt) + // Avoid introducing an extract element from a shuffle. + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, + DAG.getIntPtrConstant(SplatIdx, dl)); + } + } + + if (BaseShAmt.getNode()) { + assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); + if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); + else if (EltVT.bitsLT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); + + return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); + } + } + + // Special case in 32-bit mode, where i64 is expanded into high and low parts. 
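+ // If every 64-bit group of the underlying v4i32 build_vector matches the
+ // first one, the original bitcast operand can be fed unchanged to the
+ // VSHL/VSRL/VSRA forms, which take a single count from the low 64 bits of
+ // a vector operand.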
+ if (!Subtarget->is64Bit() && VT == MVT::v2i64 && + Amt.getOpcode() == ISD::BITCAST && + Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / + VT.getVectorNumElements(); + std::vector<SDValue> Vals(Ratio); + for (unsigned i = 0; i != Ratio; ++i) + Vals[i] = Amt.getOperand(i); + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + for (unsigned j = 0; j != Ratio; ++j) + if (Vals[j] != Amt.getOperand(i + j)) + return SDValue(); + } + + if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) + return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); + } + return SDValue(); +} + +static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + assert(VT.isVector() && "Custom lowering only for vector shifts!"); + assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); + + if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) + return V; + + if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) + return V; + + if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) + return Op; + + // XOP has 128-bit variable logical/arithmetic shifts. + // +ve/-ve Amt = shift left/right. + if (Subtarget->hasXOP() && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v8i16 || VT == MVT::v16i8)) { + if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { + SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); + Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); + } + if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); + if (Op.getOpcode() == ISD::SRA) + return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); + } + + // 2i64 vector logical shifts can efficiently avoid scalarization - do the + // shifts per-lane and then shuffle the partial results back together. + if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { + // Splat the shift amounts so the scalar shifts above will catch it. + SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); + SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); + SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); + return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); + } + + // i64 vector arithmetic shift can be emulated with the transform: + // M = lshr(SIGN_BIT, Amt) + // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) + if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && + Op.getOpcode() == ISD::SRA) { + SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); + R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + R = DAG.getNode(ISD::XOR, dl, VT, R, M); + R = DAG.getNode(ISD::SUB, dl, VT, R, M); + return R; + } + + // If possible, lower this packed shift into a vector multiply instead of + // expanding it into a sequence of scalar shifts. + // Do this only if the vector shift count is a constant build_vector. 
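+ // For example, (shl <4 x i32> %x, <1, 2, 3, 4>) becomes
+ // (mul %x, <2, 4, 8, 16>), which can be selected as a single
+ // PMULLW/PMULLD instead of four scalar shifts plus inserts and extracts.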
+ if (Op.getOpcode() == ISD::SHL && + (VT == MVT::v8i16 || VT == MVT::v4i32 || + (Subtarget->hasInt256() && VT == MVT::v16i16)) && + ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + SmallVector<SDValue, 8> Elts; + MVT SVT = VT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + APInt One(SVTBits, 1); + unsigned NumElems = VT.getVectorNumElements(); + + for (unsigned i=0; i !=NumElems; ++i) { + SDValue Op = Amt->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) { + Elts.push_back(Op); + continue; + } + + ConstantSDNode *ND = cast<ConstantSDNode>(Op); + APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); + uint64_t ShAmt = C.getZExtValue(); + if (ShAmt >= SVTBits) { + Elts.push_back(DAG.getUNDEF(SVT)); + continue; + } + Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); + } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); + return DAG.getNode(ISD::MUL, dl, VT, R, BV); + } + + // Lower SHL with variable shift amount. + if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, + DAG.getConstant(0x3f800000U, dl, VT)); + Op = DAG.getBitcast(MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + + // If possible, lower this shift as a sequence of two shifts by + // constant plus a MOVSS/MOVSD instead of scalarizing it. + // Example: + // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) + // + // Could be rewritten as: + // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) + // + // The advantage is that the two shifts from the example would be + // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing + // the vector shift into four scalar shifts plus four pairs of vector + // insert/extract. + if ((VT == MVT::v8i16 || VT == MVT::v4i32) && + ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + unsigned TargetOpcode = X86ISD::MOVSS; + bool CanBeSimplified; + // The splat value for the first packed shift (the 'X' from the example). + SDValue Amt1 = Amt->getOperand(0); + // The splat value for the second packed shift (the 'Y' from the example). + SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : + Amt->getOperand(2); + + // See if it is possible to replace this node with a sequence of + // two shifts followed by a MOVSS/MOVSD + if (VT == MVT::v4i32) { + // Check if it is legal to use a MOVSS. + CanBeSimplified = Amt2 == Amt->getOperand(2) && + Amt2 == Amt->getOperand(3); + if (!CanBeSimplified) { + // Otherwise, check if we can still simplify this node using a MOVSD. + CanBeSimplified = Amt1 == Amt->getOperand(1) && + Amt->getOperand(2) == Amt->getOperand(3); + TargetOpcode = X86ISD::MOVSD; + Amt2 = Amt->getOperand(2); + } + } else { + // Do similar checks for the case where the machine value type + // is MVT::v8i16. + CanBeSimplified = Amt1 == Amt->getOperand(1); + for (unsigned i=3; i != 8 && CanBeSimplified; ++i) + CanBeSimplified = Amt2 == Amt->getOperand(i); + + if (!CanBeSimplified) { + TargetOpcode = X86ISD::MOVSD; + CanBeSimplified = true; + Amt2 = Amt->getOperand(4); + for (unsigned i=0; i != 4 && CanBeSimplified; ++i) + CanBeSimplified = Amt1 == Amt->getOperand(i); + for (unsigned j=4; j != 8 && CanBeSimplified; ++j) + CanBeSimplified = Amt2 == Amt->getOperand(j); + } + } + + if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && + isa<ConstantSDNode>(Amt2)) { + // Replace this node with two shifts followed by a MOVSS/MOVSD. 
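+ // MOVSS replaces only the lowest 32-bit element, which covers the
+ // <X,Y,Y,Y> pattern; MOVSD replaces the low 64 bits (two i32 elements),
+ // covering <X,X,Y,Y> - hence the bitcast to v2i64 in that case.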
+ MVT CastVT = MVT::v4i32; + SDValue Splat1 = + DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); + SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); + SDValue Splat2 = + DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT); + SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); + if (TargetOpcode == X86ISD::MOVSD) + CastVT = MVT::v2i64; + SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1); + SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2); + SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, + BitCast1, DAG); + return DAG.getBitcast(VT, Result); + } + } + + // v4i32 Non Uniform Shifts. + // If the shift amount is constant we can shift each lane using the SSE2 + // immediate shifts, else we need to zero-extend each lane to the lower i64 + // and shift using the SSE2 variable shifts. + // The separate results can then be blended together. + if (VT == MVT::v4i32) { + unsigned Opc = Op.getOpcode(); + SDValue Amt0, Amt1, Amt2, Amt3; + if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); + } else { + // ISD::SHL is handled above but we include it here for completeness. + switch (Opc) { + default: + llvm_unreachable("Unknown target vector shift node"); + case ISD::SHL: + Opc = X86ISD::VSHL; + break; + case ISD::SRL: + Opc = X86ISD::VSRL; + break; + case ISD::SRA: + Opc = X86ISD::VSRA; + break; + } + // The SSE2 shifts use the lower i64 as the same shift amount for + // all lanes and the upper i64 is ignored. These shuffle masks + // optimally zero-extend each lanes on SSE2/SSE41/AVX targets. + SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); + } + + SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); + SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2); + SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3); + SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); + SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); + return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); + } + + if (VT == MVT::v16i8 || + (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) { + MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); + unsigned ShiftOpcode = Op->getOpcode(); + + auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. + if (Subtarget->hasSSE41()) { + V0 = DAG.getBitcast(VT, V0); + V1 = DAG.getBitcast(VT, V1); + Sel = DAG.getBitcast(VT, Sel); + return DAG.getBitcast(SelVT, + DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + } + // On pre-SSE41 targets we test for the sign bit by comparing to + // zero - a negative value will set all bits of the lanes to true + // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. 
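+ // (That test is PCMPGT with the zero vector as the left operand: it is
+ // true exactly in the lanes whose sign bit is set, yielding an
+ // all-ones/all-zeros mask per lane.)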
+ SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); + SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); + return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); + }; + + // Turn 'a' into a mask suitable for VSELECT: a = a << 5; + // We can safely do this using i16 shifts as we're only interested in + // the 3 lower bits of each byte. + Amt = DAG.getBitcast(ExtVT, Amt); + Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT)); + Amt = DAG.getBitcast(VT, Amt); + + if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) { + // r = VSELECT(r, shift(r, 4), a); + SDValue M = + DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 2), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // return VSELECT(r, shift(r, 1), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + return R; + } + + if (Op->getOpcode() == ISD::SRA) { + // For SRA we need to unpack each byte to the higher byte of a i16 vector + // so we can correctly sign extend. We don't care what happens to the + // lower byte. + SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R); + SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R); + ALo = DAG.getBitcast(ExtVT, ALo); + AHi = DAG.getBitcast(ExtVT, AHi); + RLo = DAG.getBitcast(ExtVT, RLo); + RHi = DAG.getBitcast(ExtVT, RHi); + + // r = VSELECT(r, shift(r, 4), a); + SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(4, dl, ExtVT)); + SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(4, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // a += a + ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); + AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); + + // r = VSELECT(r, shift(r, 2), a); + MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(2, dl, ExtVT)); + MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(2, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // a += a + ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); + AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); + + // r = VSELECT(r, shift(r, 1), a); + MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(1, dl, ExtVT)); + MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(1, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // Logical shift the result back to the lower byte, leaving a zero upper + // byte + // meaning that we can safely pack with PACKUSWB. + RLo = + DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT)); + RHi = + DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); + } + } + + // It's worth extending once and using the v8i32 shifts for 16-bit types, but + // the extra overheads to get from v16i8 to v8i32 make the existing SSE + // solution better. 
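+ // The value is sign- or zero-extended to v8i32 to match the shift kind,
+ // while the amount only needs its low bits and can use ANY_EXTEND; the
+ // wide result is then truncated back to v8i16.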
+ if (Subtarget->hasInt256() && VT == MVT::v8i16) { + MVT ExtVT = MVT::v8i32; + unsigned ExtOpc = + Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + R = DAG.getNode(ExtOpc, dl, ExtVT, R); + Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); + } + + if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) { + MVT ExtVT = MVT::v8i32; + SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); + SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); + SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R); + SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R); + ALo = DAG.getBitcast(ExtVT, ALo); + AHi = DAG.getBitcast(ExtVT, AHi); + RLo = DAG.getBitcast(ExtVT, RLo); + RHi = DAG.getBitcast(ExtVT, RHi); + SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo); + SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi); + Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT)); + Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); + } + + if (VT == MVT::v8i16) { + unsigned ShiftOpcode = Op->getOpcode(); + + auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. + if (Subtarget->hasSSE41()) { + MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); + V0 = DAG.getBitcast(ExtVT, V0); + V1 = DAG.getBitcast(ExtVT, V1); + Sel = DAG.getBitcast(ExtVT, Sel); + return DAG.getBitcast( + VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); + } + // On pre-SSE41 targets we splat the sign bit - a negative value will + // set all bits of the lanes to true and VSELECT uses that in + // its OR(AND(V0,C),AND(V1,~C)) lowering. + SDValue C = + DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); + return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); + }; + + // Turn 'a' into a mask suitable for VSELECT: a = a << 12; + if (Subtarget->hasSSE41()) { + // On SSE41 targets we need to replicate the shift mask in both + // bytes for PBLENDVB. + Amt = DAG.getNode( + ISD::OR, dl, VT, + DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)), + DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT))); + } else { + Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)); + } + + // r = VSELECT(r, shift(r, 8), a); + SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 4), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 2), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // return VSELECT(r, shift(r, 1), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); + R = SignBitSelect(Amt, M, R); + return R; + } + + // Decompose 256-bit shifts into smaller 128-bit shifts. 
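+ // AVX1 provides no 256-bit integer shift instructions, so any remaining
+ // 256-bit case is split in half: both the value and a non-constant amount
+ // are extracted as 128-bit vectors, shifted, and concatenated back.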
+ if (VT.is256BitVector()) { + unsigned NumElems = VT.getVectorNumElements(); + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + // Extract the two vectors + SDValue V1 = Extract128BitVector(R, 0, DAG, dl); + SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); + + // Recreate the shift amount vectors + SDValue Amt1, Amt2; + if (Amt.getOpcode() == ISD::BUILD_VECTOR) { + // Constant shift amount + SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems); + ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2); + ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2); + + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); + } else { + // Variable shift amount + Amt1 = Extract128BitVector(Amt, 0, DAG, dl); + Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); + } + + // Issue new vector shifts for the smaller types + V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); + V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); + + // Concatenate the result back + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); + } + + return SDValue(); +} + +static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + assert(VT.isVector() && "Custom lowering only for vector rotates!"); + assert(Subtarget->hasXOP() && "XOP support required for vector rotates!"); + assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + + // XOP has 128-bit vector variable + immediate rotates. + // +ve/-ve Amt = rotate left/right. + + // Split 256-bit integers. + if (VT.is256BitVector()) + return Lower256IntArith(Op, DAG); + + assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); + + // Attempt to rotate by immediate. + if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { + if (auto *RotateConst = BVAmt->getConstantSplatNode()) { + uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); + assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + return DAG.getNode(X86ISD::VPROTI, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Use general rotate by variable (per-element). + return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); +} + +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { + // Lower the "add/sub/mul with overflow" instruction into a regular ins plus + // a "setcc" instruction that checks the overflow flag. The "brcond" lowering + // looks for this combo and may remove the "setcc" instruction if the "setcc" + // has only one use. + SDNode *N = Op.getNode(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + unsigned BaseOp = 0; + unsigned Cond = 0; + SDLoc DL(Op); + switch (Op.getOpcode()) { + default: llvm_unreachable("Unknown ovf instruction!"); + case ISD::SADDO: + // A subtract of one will be selected as a INC. Note that INC doesn't + // set CF, so we can't do this for UADDO. + if (isOneConstant(RHS)) { + BaseOp = X86ISD::INC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::ADD; + Cond = X86::COND_O; + break; + case ISD::UADDO: + BaseOp = X86ISD::ADD; + Cond = X86::COND_B; + break; + case ISD::SSUBO: + // A subtract of one will be selected as a DEC. Note that DEC doesn't + // set CF, so we can't do this for USUBO. 
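+ // (INC and DEC never modify CF on x86, which is why only the overflow
+ // conditions, not the carry conditions, can use them.)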
+ if (isOneConstant(RHS)) { + BaseOp = X86ISD::DEC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::SUB; + Cond = X86::COND_O; + break; + case ISD::USUBO: + BaseOp = X86ISD::SUB; + Cond = X86::COND_B; + break; + case ISD::SMULO: + BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL; + Cond = X86::COND_O; + break; + case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs + if (N->getValueType(0) == MVT::i8) { + BaseOp = X86ISD::UMUL8; + Cond = X86::COND_O; + break; + } + SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), + MVT::i32); + SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(X86::COND_O, DL, MVT::i32), + SDValue(Sum.getNode(), 2)); + + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); + } + } + + // Also sets EFLAGS. + SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); + SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), + DAG.getConstant(Cond, DL, MVT::i32), + SDValue(Sum.getNode(), 1)); + + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); +} + +/// Returns true if the operand type is exactly twice the native width, and +/// the corresponding cmpxchg8b or cmpxchg16b instruction is available. +/// Used to know whether to use cmpxchg8/16b when expanding atomic operations +/// (otherwise we leave them alone to become __sync_fetch_and_... calls). +bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + + if (OpWidth == 64) + return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + else if (OpWidth == 128) + return Subtarget->hasCmpxchg16b(); + else + return false; +} + +bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return needsCmpXchgNb(SI->getValueOperand()->getType()); +} + +// Note: this turns large loads into lock cmpxchg8b/16b. +// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); + return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; +} + +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; + Type *MemType = AI->getType(); + + // If the operand is too big, we must see if cmpxchg8/16b is available + // and default to library calls otherwise. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) { + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; + } + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return AtomicExpansionKind::None; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty() ? 
AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return AtomicExpansionKind::CmpXChg; + } +} + +static bool hasMFENCE(const X86Subtarget& Subtarget) { + // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for + // no-sse2). There isn't any reason to disable it if the target processor + // supports it. + return Subtarget.hasSSE2() || Subtarget.is64Bit(); +} + +LoadInst * +X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { + unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; + Type *MemType = AI->getType(); + // Accesses larger than the native width are turned into cmpxchg/libcalls, so + // there is no benefit in turning such RMWs into loads, and it is actually + // harmful as it introduces a mfence. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return nullptr; + + auto Builder = IRBuilder<>(AI); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + auto SynchScope = AI->getSynchScope(); + // We must restrict the ordering to avoid generating loads with Release or + // ReleaseAcquire orderings. + auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); + auto Ptr = AI->getPointerOperand(); + + // Before the load we need a fence. Here is an example lifted from + // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence + // is required: + // Thread 0: + // x.store(1, relaxed); + // r1 = y.fetch_add(0, release); + // Thread 1: + // y.fetch_add(42, acquire); + // r2 = x.load(relaxed); + // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is + // lowered to just a load without a fence. A mfence flushes the store buffer, + // making the optimization clearly correct. + // FIXME: it is required if isAtLeastRelease(Order) but it is not clear + // otherwise, we might be able to be more aggressive on relaxed idempotent + // rmw. In practice, they do not look useful, so we don't try to be + // especially clever. + if (SynchScope == SingleThread) + // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at + // the IR level, so we must wrap it in an intrinsic. + return nullptr; + + if (!hasMFENCE(*Subtarget)) + // FIXME: it might make sense to use a locked operation here but on a + // different cache-line to prevent cache-line bouncing. In practice it + // is probably a small win, and x86 processors without mfence are rare + // enough that we do not bother. + return nullptr; + + Function *MFence = + llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); + Builder.CreateCall(MFence, {}); + + // Finally we can emit the atomic load. 
+ LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, + AI->getType()->getPrimitiveSizeInBits()); + Loaded->setAtomic(Order, SynchScope); + AI->replaceAllUsesWith(Loaded); + AI->eraseFromParent(); + return Loaded; +} + +static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( + cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); + SynchronizationScope FenceScope = static_cast<SynchronizationScope>( + cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); + + // The only fence that needs an instruction is a sequentially-consistent + // cross-thread fence. + if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { + if (hasMFENCE(*Subtarget)) + return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); + + SDValue Chain = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, dl, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(0, dl, MVT::i32), // Disp + DAG.getRegister(0, MVT::i32), // Segment. + Zero, + Chain + }; + SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); + return SDValue(Res, 0); + } + + // MEMBARRIER is a compiler barrier; it codegens to a no-op. + return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); +} + +static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT T = Op.getSimpleValueType(); + SDLoc DL(Op); + unsigned Reg = 0; + unsigned size = 0; + switch(T.SimpleTy) { + default: llvm_unreachable("Invalid value type!"); + case MVT::i8: Reg = X86::AL; size = 1; break; + case MVT::i16: Reg = X86::AX; size = 2; break; + case MVT::i32: Reg = X86::EAX; size = 4; break; + case MVT::i64: + assert(Subtarget->is64Bit() && "Node not type legal!"); + Reg = X86::RAX; size = 8; + break; + } + SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, + Op.getOperand(2), SDValue()); + SDValue Ops[] = { cpIn.getValue(0), + Op.getOperand(1), + Op.getOperand(3), + DAG.getTargetConstant(size, DL, MVT::i8), + cpIn.getValue(1) }; + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, + Ops, T, MMO); + + SDValue cpOut = + DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); + SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, + MVT::i32, cpOut.getValue(2)); + SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), + DAG.getConstant(X86::COND_E, DL, MVT::i8), + EFLAGS); + + DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); + return SDValue(); +} + +static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT SrcVT = Op.getOperand(0).getSimpleValueType(); + MVT DstVT = Op.getSimpleValueType(); + + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (DstVT != MVT::f64) + // This conversion needs to be expanded. 
+ return SDValue(); + + SDValue InVec = Op->getOperand(0); + SDLoc dl(Op); + unsigned NumElts = SrcVT.getVectorNumElements(); + MVT SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + SmallVector<SDValue, 16> Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, + DAG.getIntPtrConstant(i, dl))); + + // Explicitly mark the extra elements as Undef. + Elts.append(NumElts, DAG.getUNDEF(SVT)); + + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); + SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, + DAG.getIntPtrConstant(0, dl)); + } + + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && + Subtarget->hasMMX() && "Unexpected custom BITCAST"); + assert((DstVT == MVT::i64 || + (DstVT.isVector() && DstVT.getSizeInBits()==64)) && + "Unexpected custom BITCAST"); + // i64 <=> MMX conversions are Legal. + if (SrcVT==MVT::i64 && DstVT.isVector()) + return Op; + if (DstVT==MVT::i64 && SrcVT.isVector()) + return Op; + // MMX <=> MMX conversions are Legal. + if (SrcVT.isVector() && DstVT.isVector()) + return Op; + // All other conversions need to be expanded. + return SDValue(); +} + +/// Compute the horizontal sum of bytes in V for the elements of VT. +/// +/// Requires V to be a byte vector and VT to be an integer vector type with +/// wider elements than V's type. The width of the elements of VT determines +/// how many bytes of V are summed horizontally to produce each element of the +/// result. +static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(V); + MVT ByteVecVT = V.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + int NumElts = VT.getVectorNumElements(); + assert(ByteVecVT.getVectorElementType() == MVT::i8 && + "Expected value to have byte element type."); + assert(EltVT != MVT::i8 && + "Horizontal byte sum only makes sense for wider elements!"); + unsigned VecSize = VT.getSizeInBits(); + assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"); + + // PSADBW instruction horizontally add all bytes and leave the result in i64 + // chunks, thus directly computes the pop count for v2i64 and v4i64. + if (EltVT == MVT::i64) { + SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); + return DAG.getBitcast(VT, V); + } + + if (EltVT == MVT::i32) { + // We unpack the low half and high half into i32s interleaved with zeros so + // that we can use PSADBW to horizontally sum them. The most useful part of + // this is that it lines up the results of two PSADBW instructions to be + // two v2i64 vectors which concatenated are the 4 population counts. We can + // then use PACKUSWB to shrink and concatenate them into a v4i32 again. + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); + SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros); + SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros); + + // Do the horizontal sums into two v2i64s. 
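+ // After the interleave each 64-bit chunk holds the four bytes of one
+ // original i32 element plus four zero bytes, so PSADBW against an all-zero
+ // vector leaves that element's byte sum in the corresponding i64 lane.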
+ Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, + DAG.getBitcast(ByteVecVT, Low), Zeros); + High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, + DAG.getBitcast(ByteVecVT, High), Zeros); + + // Merge them together. + MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); + V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, + DAG.getBitcast(ShortVecVT, Low), + DAG.getBitcast(ShortVecVT, High)); + + return DAG.getBitcast(VT, V); + } + + // The only element type left is i16. + assert(EltVT == MVT::i16 && "Unknown how to handle type"); + + // To obtain pop count for each i16 element starting from the pop count for + // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s + // right by 8. It is important to shift as i16s as i8 vector shift isn't + // directly supported. + SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT)); + SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter); + V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), + DAG.getBitcast(ByteVecVT, V)); + return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter); +} + +static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned VecSize = VT.getSizeInBits(); + + // Implement a lookup table in register by using an algorithm based on: + // http://wm.ite.pl/articles/sse-popcount.html + // + // The general idea is that every lower byte nibble in the input vector is an + // index into a in-register pre-computed pop count table. We then split up the + // input vector in two new ones: (1) a vector with only the shifted-right + // higher nibbles for each byte and (2) a vector with the lower nibbles (and + // masked out higher ones) for each byte. PSHUB is used separately with both + // to index the in-register table. Next, both are added and the result is a + // i8 vector where each element contains the pop count for input byte. + // + // To obtain the pop count for elements != i8, we follow up with the same + // approach and use additional tricks as described below. + // + const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; + + int NumByteElts = VecSize / 8; + MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); + SDValue In = DAG.getBitcast(ByteVecVT, Op); + SmallVector<SDValue, 16> LUTVec; + for (int i = 0; i < NumByteElts; ++i) + LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); + SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec); + SmallVector<SDValue, 16> Mask0F(NumByteElts, + DAG.getConstant(0x0F, DL, MVT::i8)); + SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F); + + // High nibbles + SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8)); + SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four); + SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); + + // Low nibbles + SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); + + // The input vector is used as the shuffle mask that index elements into the + // LUT. 
After counting low and high nibbles, add the vector to obtain the + // final pop count per i8 element. + SDValue HighPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); + SDValue LowPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); + SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); + + if (EltVT == MVT::i8) + return PopCnt; + + return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG); +} + +static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + assert(VT.is128BitVector() && + "Only 128-bit vector bitmath lowering supported."); + + int VecSize = VT.getSizeInBits(); + MVT EltVT = VT.getVectorElementType(); + int Len = EltVT.getSizeInBits(); + + // This is the vectorized version of the "best" algorithm from + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + // with a minor tweak to use a series of adds + shifts instead of vector + // multiplications. Implemented for all integer vector types. We only use + // this when we don't have SSSE3 which allows a LUT-based lowering that is + // much faster, even faster than using native popcnt instructions. + + auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) { + MVT VT = V.getSimpleValueType(); + SmallVector<SDValue, 32> Shifters( + VT.getVectorNumElements(), + DAG.getConstant(Shifter, DL, VT.getVectorElementType())); + return DAG.getNode(OpCode, DL, VT, V, + DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters)); + }; + auto GetMask = [&](SDValue V, APInt Mask) { + MVT VT = V.getSimpleValueType(); + SmallVector<SDValue, 32> Masks( + VT.getVectorNumElements(), + DAG.getConstant(Mask, DL, VT.getVectorElementType())); + return DAG.getNode(ISD::AND, DL, VT, V, + DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks)); + }; + + // We don't want to incur the implicit masks required to SRL vNi8 vectors on + // x86, so set the SRL type to have elements at least i16 wide. This is + // correct because all of our SRLs are followed immediately by a mask anyways + // that handles any bits that sneak into the high bits of the byte elements. + MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16); + + SDValue V = Op; + + // v = v - ((v >> 1) & 0x55555555...) + SDValue Srl = + DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1)); + SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55))); + V = DAG.getNode(ISD::SUB, DL, VT, V, And); + + // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) + SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33))); + Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2)); + SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33))); + V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS); + + // v = (v + (v >> 4)) & 0x0F0F0F0F... + Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); + V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F))); + + // At this point, V contains the byte-wise population count, and we are + // merely doing a horizontal sum if necessary to get the wider element + // counts. 
+ if (EltVT == MVT::i8) + return V; + + return LowerHorizontalByteSum( + DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget, + DAG); +} + +static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + // FIXME: Need to add AVX-512 support here! + assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unknown CTPOP type to handle"); + SDLoc DL(Op.getNode()); + SDValue Op0 = Op.getOperand(0); + + if (!Subtarget->hasSSSE3()) { + // We can't use the fast LUT approach, so fall back on vectorized bitmath. + assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); + return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); + } + + if (VT.is256BitVector() && !Subtarget->hasInt256()) { + unsigned NumElems = VT.getVectorNumElements(); + + // Extract each 128-bit vector, compute pop count and concat the result. + SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL); + SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL); + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, + LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), + LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); + } + + return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); +} + +static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Op.getSimpleValueType().isVector() && + "We only do custom lowering for vector population count."); + return LowerVectorCTPOP(Op, Subtarget, DAG); +} + +static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + EVT T = Node->getValueType(0); + SDValue negOp = DAG.getNode(ISD::SUB, dl, T, + DAG.getConstant(0, dl, T), Node->getOperand(2)); + return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, + cast<AtomicSDNode>(Node)->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), negOp, + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); +} + +static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + + // Convert seq_cst store -> xchg + // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) + // FIXME: On 32-bit, store -> fist or movq would be more efficient + // (The only way to get a 16-byte store is cmpxchg16b) + // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. + if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || + !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, + cast<AtomicSDNode>(Node)->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2), + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); + return Swap.getValue(1); + } + // Other atomic stores have a simple pattern. + return Op; +} + +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getNode()->getSimpleValueType(0); + + // Let legalize expand this if it isn't a legal type yet. 
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: llvm_unreachable("Invalid code"); + case ISD::ADDC: Opc = X86ISD::ADD; break; + case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; + case ISD::SUBC: Opc = X86ISD::SUB; break; + case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), + Op.getOperand(1), Op.getOperand(2)); +} + +static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); + + // For MacOSX, we want to call an alternative entry point: __sincos_stret, + // which returns the values as { float, float } (in XMM0) or + // { double, double } (which is returned in XMM0, XMM1). + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + bool isF64 = ArgVT == MVT::f64; + // Only optimize x86_64 for now. i386 is a bit messy. For f32, + // the small struct {f32, f32} is returned in (eax, edx). For f64, + // the results are returned via SRet in memory. + const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Callee = + DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); + + Type *RetTy = isF64 + ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) + : (Type*)VectorType::get(ArgTy, 4); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (isF64) + // Returned in xmm0 and xmm1. + return CallResult.first; + + // Returned in bits 0:31 and 32:64 xmm0. + SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, + CallResult.first, DAG.getIntPtrConstant(0, dl)); + SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, + CallResult.first, DAG.getIntPtrConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. 
+ MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (InOp.getOpcode() == ISD::CONCAT_VECTORS && + InOp.getNumOperands() == 2) { + SDValue N1 = InOp.getOperand(1); + if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || + N1.isUndef()) { + InOp = InOp.getOperand(0); + InVT = InOp.getSimpleValueType(); + InNumElts = InVT.getVectorNumElements(); + } + } + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector<SDValue, 16> Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + +static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX512() && + "MGATHER/MSCATTER are supported on AVX-512 arch only"); + + // X86 scatter kills mask register, so its type should be added to + // the list of return values. + // If the "scatter" has 2 return values, it is already handled. + if (Op.getNode()->getNumValues() == 2) + return Op; + + MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); + SDValue Src = N->getValue(); + MVT VT = Src.getSimpleValueType(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); + SDLoc dl(Op); + + SDValue NewScatter; + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Chain = N->getChain(); + SDValue BasePtr = N->getBasePtr(); + MVT MemVT = N->getMemoryVT().getSimpleVT(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { + // The v2i32 value was promoted to v2i64. + // Now we "redo" the type legalizer's work and widen the original + // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 + // with a shuffle. + assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && + "Unexpected memory type"); + int ShuffleMask[] = {0, 2, -1, -1}; + Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), + DAG.getUNDEF(MVT::v4i32), ShuffleMask); + // Now we have 4 elements instead of 2. + // Expand the index. 
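+ // The extra index lanes may be left undef, but the mask is widened with
+ // zeroes below, so those lanes can never actually be stored.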
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); + Index = ExtendToType(Index, NewIndexVT, DAG); + + // Expand the mask with zeroes + // Mask may be <2 x i64> or <2 x i1> at this moment + assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && + "Unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + VT = MVT::v4i32; + } + + unsigned NumElts = VT.getVectorNumElements(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (IndexVT == MVT::v8i32) + // Just extend index + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + else { + // The minimal number of elts in scatter is 8 + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + // Use original index here, do not modify the index twice + Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + // Use the original mask here, do not modify the mask twice + Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); + + // The value that should be stored + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src = ExtendToType(Src, NewVT, DAG); + } + } + // If the mask is "wide" at this point - truncate it to i1 vector + MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); + Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); + + // The mask is killed by scatter, add it to the values + SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; + NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); + return SDValue(NewScatter.getNode(), 0); +} + +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + + MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +static SDValue LowerMSTORE(SDValue Op, const 
X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode()); + SDValue DataToStore = N->getValue(); + MVT VT = DataToStore.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + N->isTruncatingStore()); + } + return Op; +} + +static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX512() && + "MGATHER/MSCATTER are supported on AVX-512 arch only"); + + MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Src0 = N->getValue(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + unsigned NumElts = VT.getVectorNumElements(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); + + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (NumElts == 8) { + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + DAG.UpdateNodeOperands(N, Ops); + return Op; + } + + // Minimal number of elements in Gather + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + Index = ExtendToType(Index, NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); + + // The pass-thru value + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src0 = ExtendToType(Src0, NewVT, DAG); + + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewGather.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewGather.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, + SelectionDAG &DAG) const { + // TODO: Eventually, the lowering of these nodes should be informed by or + // deferred to the GC strategy 
for the function in which they appear. For + // now, however, they must be lowered to something. Since they are logically + // no-ops in the case of a null GC strategy (or a GC strategy which does not + // require special handling for these nodes), lower them as literal NOOPs for + // the time being. + SmallVector<SDValue, 2> Ops; + + Ops.push_back(Op.getOperand(0)); + if (Op->getGluedNode()) + Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); + + SDLoc OpDL(Op); + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); + + return NOOP; +} + +SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, + SelectionDAG &DAG) const { + // TODO: Eventually, the lowering of these nodes should be informed by or + // deferred to the GC strategy for the function in which they appear. For + // now, however, they must be lowered to something. Since they are logically + // no-ops in the case of a null GC strategy (or a GC strategy which does not + // require special handling for these nodes), lower them as literal NOOPs for + // the time being. + SmallVector<SDValue, 2> Ops; + + Ops.push_back(Op.getOperand(0)); + if (Op->getGluedNode()) + Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); + + SDLoc OpDL(Op); + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); + + return NOOP; +} + +/// LowerOperation - Provide custom lowering hooks for some operations. +/// +SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: llvm_unreachable("Should not custom lower this!"); + case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); + case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); + case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); + case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); + case ISD::VSELECT: return LowerVSELECT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); + case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::SHL_PARTS: + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); + case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); + case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); + case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, 
DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); + case ISD::FABS: + case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); + case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); + case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::SETCCE: return LowerSETCCE(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); + case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::VAARG: return LowerVAARG(Op, DAG); + case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::FRAME_TO_ARGS_OFFSET: + return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); + case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); + case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); + case ISD::UMUL_LOHI: + case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); + case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: return LowerShift(Op, Subtarget, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: return LowerXALUO(Op, DAG); + case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); + case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::ADD: return LowerADD(Op, DAG); + case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: return LowerMINMAX(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); + case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); + case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); + case ISD::GC_TRANSITION_START: + return LowerGC_TRANSITION_START(Op, DAG); + case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); + } +} + +/// ReplaceNodeResults - Replace a node with an illegal result type +/// with a new node built out of custom code. 
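+/// The replacement values are pushed onto Results in the same order as the
+/// original node's results.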
+void X86TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const { + SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + switch (N->getOpcode()) { + default: + llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::AVG: { + // Legalize types for X86ISD::AVG by expanding vectors. + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + + auto InVT = N->getValueType(0); + auto InVTSize = InVT.getSizeInBits(); + const unsigned RegSize = + (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; + assert((!Subtarget->hasAVX512() || RegSize < 512) && + "512-bit vector requires AVX512"); + assert((!Subtarget->hasAVX2() || RegSize < 256) && + "256-bit vector requires AVX2"); + + auto ElemVT = InVT.getVectorElementType(); + auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, + RegSize / ElemVT.getSizeInBits()); + assert(RegSize % InVT.getSizeInBits() == 0); + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT)); + Ops[0] = N->getOperand(0); + SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + Ops[0] = N->getOperand(1); + SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + + SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); + Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + DAG.getIntPtrConstant(0, dl))); + return; + } + // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. + case X86ISD::FMINC: + case X86ISD::FMIN: + case X86ISD::FMAXC: + case X86ISD::FMAX: { + EVT VT = N->getValueType(0); + assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); + SDValue UNDEF = DAG.getUNDEF(VT); + SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(0), UNDEF); + SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(1), UNDEF); + Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); + return; + } + case ISD::SIGN_EXTEND_INREG: + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + // We don't want to expand or promote these. + return; + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: { + SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); + Results.push_back(V); + return; + } + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: { + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + + std::pair<SDValue,SDValue> Vals = + FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); + SDValue FIST = Vals.first, StackSlot = Vals.second; + if (FIST.getNode()) { + EVT VT = N->getValueType(0); + // Return a load from the stack slot. 
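+ // The FIST path stores the converted integer into a stack temporary and
+ // returns the store's chain, so the illegal-typed result is produced by
+ // reloading that slot; when no stack slot was used, FIST itself is the
+ // converted value.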
+ if (StackSlot.getNode()) + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0)); + else + Results.push_back(FIST); + } + return; + } + case ISD::UINT_TO_FP: { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (N->getOperand(0).getValueType() != MVT::v2i32 || + N->getValueType(0) != MVT::v2f32) + return; + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, + N->getOperand(0)); + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, + MVT::f64); + SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, + DAG.getBitcast(MVT::v2i64, VBias)); + Or = DAG.getBitcast(MVT::v2f64, Or); + // TODO: Are there any fast-math-flags to propagate here? + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); + Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + return; + } + case ISD::FP_ROUND: { + if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) + return; + SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); + Results.push_back(V); + return; + } + case ISD::FP_EXTEND: { + // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. + // No other ValueType for FP_EXTEND should reach this point. + assert(N->getValueType(0) == MVT::v2f32 && + "Do not know how to legalize this Node"); + return; + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default : llvm_unreachable("Do not know how to custom type " + "legalize this intrinsic operation!"); + case Intrinsic::x86_rdtsc: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + case Intrinsic::x86_rdtscp: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, + Results); + case Intrinsic::x86_rdpmc: + return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); + } + } + case ISD::INTRINSIC_WO_CHAIN: { + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(V); + return; + } + case ISD::READCYCLECOUNTER: { + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + } + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { + EVT T = N->getValueType(0); + assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); + bool Regs64bit = T == MVT::i128; + MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; + SDValue cpInL, cpInH; + cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(0, dl, HalfT)); + cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(1, dl, HalfT)); + cpInL = DAG.getCopyToReg(N->getOperand(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + cpInL, SDValue()); + cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, + Regs64bit ? X86::RDX : X86::EDX, + cpInH, cpInL.getValue(1)); + SDValue swapInL, swapInH; + swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(0, dl, HalfT)); + swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(1, dl, HalfT)); + swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, + Regs64bit ? X86::RBX : X86::EBX, + swapInL, cpInH.getValue(1)); + swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, + Regs64bit ? 
X86::RCX : X86::ECX, + swapInH, swapInL.getValue(1)); + SDValue Ops[] = { swapInH.getValue(0), + N->getOperand(1), + swapInH.getValue(1) }; + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); + unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : + X86ISD::LCMPXCHG8_DAG; + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + HalfT, Result.getValue(1)); + SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, + Regs64bit ? X86::RDX : X86::EDX, + HalfT, cpOutL.getValue(2)); + SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + + SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, + MVT::i32, cpOutH.getValue(2)); + SDValue Success = + DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS); + Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); + + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); + Results.push_back(Success); + Results.push_back(EFLAGS.getValue(1)); + return; + } + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD: { + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by AtomicExpandPass.cpp. + break; + } + case ISD::BITCAST: { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + EVT DstVT = N->getValueType(0); + EVT SrcVT = N->getOperand(0)->getValueType(0); + + if (SrcVT != MVT::f64 || + (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) + return; + + unsigned NumElts = DstVT.getVectorNumElements(); + EVT SVT = DstVT.getVectorElementType(); + EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, N->getOperand(0)); + SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); + + if (ExperimentalVectorWideningLegalization) { + // If we are legalizing vectors by widening, we already have the desired + // legal vector type, just return it. 
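+ // (For example, an f64 -> v2i32 bitcast has already been rewritten above as
+ // a v2f64 -> v4i32 bitcast, and v4i32 is exactly the widened type the
+ // legalizer expects for the illegal v2i32 result.)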
+ Results.push_back(ToVecInt); + return; + } + + SmallVector<SDValue, 8> Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, + ToVecInt, DAG.getIntPtrConstant(i, dl))); + + Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); + } + } +} + +const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((X86ISD::NodeType)Opcode) { + case X86ISD::FIRST_NUMBER: break; + case X86ISD::BSF: return "X86ISD::BSF"; + case X86ISD::BSR: return "X86ISD::BSR"; + case X86ISD::SHLD: return "X86ISD::SHLD"; + case X86ISD::SHRD: return "X86ISD::SHRD"; + case X86ISD::FAND: return "X86ISD::FAND"; + case X86ISD::FANDN: return "X86ISD::FANDN"; + case X86ISD::FOR: return "X86ISD::FOR"; + case X86ISD::FXOR: return "X86ISD::FXOR"; + case X86ISD::FILD: return "X86ISD::FILD"; + case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; + case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; + case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; + case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; + case X86ISD::FLD: return "X86ISD::FLD"; + case X86ISD::FST: return "X86ISD::FST"; + case X86ISD::CALL: return "X86ISD::CALL"; + case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; + case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; + case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; + case X86ISD::BT: return "X86ISD::BT"; + case X86ISD::CMP: return "X86ISD::CMP"; + case X86ISD::COMI: return "X86ISD::COMI"; + case X86ISD::UCOMI: return "X86ISD::UCOMI"; + case X86ISD::CMPM: return "X86ISD::CMPM"; + case X86ISD::CMPMU: return "X86ISD::CMPMU"; + case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; + case X86ISD::SETCC: return "X86ISD::SETCC"; + case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; + case X86ISD::FSETCC: return "X86ISD::FSETCC"; + case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86"; + case X86ISD::CMOV: return "X86ISD::CMOV"; + case X86ISD::BRCOND: return "X86ISD::BRCOND"; + case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; + case X86ISD::IRET: return "X86ISD::IRET"; + case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; + case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; + case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; + case X86ISD::Wrapper: return "X86ISD::Wrapper"; + case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; + case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; + case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; + case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; + case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; + case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; + case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; + case X86ISD::PINSRB: return "X86ISD::PINSRB"; + case X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; + case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; + case X86ISD::ANDNP: return "X86ISD::ANDNP"; + case X86ISD::PSIGN: return "X86ISD::PSIGN"; + case X86ISD::BLENDI: return "X86ISD::BLENDI"; + case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; + case X86ISD::ADDUS: return "X86ISD::ADDUS"; + case X86ISD::SUBUS: return "X86ISD::SUBUS"; + case X86ISD::HADD: return "X86ISD::HADD"; + case X86ISD::HSUB: return "X86ISD::HSUB"; + case X86ISD::FHADD: return "X86ISD::FHADD"; + case X86ISD::FHSUB: return "X86ISD::FHSUB"; + case X86ISD::ABS: return "X86ISD::ABS"; + case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; + case X86ISD::FMAX: return "X86ISD::FMAX"; + case X86ISD::FMAX_RND: 
return "X86ISD::FMAX_RND"; + case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; + case X86ISD::FMAXC: return "X86ISD::FMAXC"; + case X86ISD::FMINC: return "X86ISD::FMINC"; + case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; + case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; + case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; + case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; + case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; + case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; + case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; + case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; + case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; + case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; + case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; + case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; + case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; + case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; + case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; + case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VZEXT: return "X86ISD::VZEXT"; + case X86ISD::VSEXT: return "X86ISD::VSEXT"; + case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; + case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; + case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; + case X86ISD::VINSERT: return "X86ISD::VINSERT"; + case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; + case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; + case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; + case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; + case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; + case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; + case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; + case X86ISD::VSHL: return "X86ISD::VSHL"; + case X86ISD::VSRL: return "X86ISD::VSRL"; + case X86ISD::VSRA: return "X86ISD::VSRA"; + case X86ISD::VSHLI: return "X86ISD::VSHLI"; + case X86ISD::VSRLI: return "X86ISD::VSRLI"; + case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::CMPP: return "X86ISD::CMPP"; + case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; + case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; + case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; + case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; + case X86ISD::ADD: return "X86ISD::ADD"; + case X86ISD::SUB: return "X86ISD::SUB"; + case X86ISD::ADC: return "X86ISD::ADC"; + case X86ISD::SBB: return "X86ISD::SBB"; + case X86ISD::SMUL: return "X86ISD::SMUL"; + case X86ISD::UMUL: return "X86ISD::UMUL"; + case X86ISD::SMUL8: return "X86ISD::SMUL8"; + case X86ISD::UMUL8: return "X86ISD::UMUL8"; + case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; + case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; + case X86ISD::INC: return "X86ISD::INC"; + case X86ISD::DEC: return "X86ISD::DEC"; + case X86ISD::OR: return "X86ISD::OR"; + case X86ISD::XOR: return "X86ISD::XOR"; + case X86ISD::AND: return "X86ISD::AND"; + case X86ISD::BEXTR: return "X86ISD::BEXTR"; + case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; + case X86ISD::PTEST: return "X86ISD::PTEST"; + case X86ISD::TESTP: return "X86ISD::TESTP"; + case X86ISD::TESTM: return "X86ISD::TESTM"; + case X86ISD::TESTNM: return "X86ISD::TESTNM"; + case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::KTEST: return "X86ISD::KTEST"; + case X86ISD::PACKSS: return "X86ISD::PACKSS"; + case X86ISD::PACKUS: return "X86ISD::PACKUS"; + case 
X86ISD::PALIGNR: return "X86ISD::PALIGNR"; + case X86ISD::VALIGN: return "X86ISD::VALIGN"; + case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; + case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; + case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; + case X86ISD::SHUFP: return "X86ISD::SHUFP"; + case X86ISD::SHUF128: return "X86ISD::SHUF128"; + case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; + case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; + case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; + case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; + case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; + case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; + case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; + case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; + case X86ISD::MOVSD: return "X86ISD::MOVSD"; + case X86ISD::MOVSS: return "X86ISD::MOVSS"; + case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; + case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; + case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; + case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; + case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; + case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; + case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; + case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::VPERMV: return "X86ISD::VPERMV"; + case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; + case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; + case X86ISD::VPERMI: return "X86ISD::VPERMI"; + case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; + case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; + case X86ISD::VRANGE: return "X86ISD::VRANGE"; + case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; + case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; + case X86ISD::PSADBW: return "X86ISD::PSADBW"; + case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; + case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; + case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; + case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; + case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; + case X86ISD::MFENCE: return "X86ISD::MFENCE"; + case X86ISD::SFENCE: return "X86ISD::SFENCE"; + case X86ISD::LFENCE: return "X86ISD::LFENCE"; + case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; + case X86ISD::SAHF: return "X86ISD::SAHF"; + case X86ISD::RDRAND: return "X86ISD::RDRAND"; + case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; + case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; + case X86ISD::VPROT: return "X86ISD::VPROT"; + case X86ISD::VPROTI: return "X86ISD::VPROTI"; + case X86ISD::VPSHA: return "X86ISD::VPSHA"; + case X86ISD::VPSHL: return "X86ISD::VPSHL"; + case X86ISD::VPCOM: return "X86ISD::VPCOM"; + case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; + case X86ISD::FMADD: return "X86ISD::FMADD"; + case X86ISD::FMSUB: return "X86ISD::FMSUB"; + case X86ISD::FNMADD: return "X86ISD::FNMADD"; + case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; + case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; + case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; + case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; + case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; + case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; + case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; + case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; + case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; + case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; + case X86ISD::VREDUCE: 
return "X86ISD::VREDUCE"; + case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; + case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; + case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; + case X86ISD::XTEST: return "X86ISD::XTEST"; + case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; + case X86ISD::EXPAND: return "X86ISD::EXPAND"; + case X86ISD::SELECT: return "X86ISD::SELECT"; + case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; + case X86ISD::RCP28: return "X86ISD::RCP28"; + case X86ISD::EXP2: return "X86ISD::EXP2"; + case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; + case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; + case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; + case X86ISD::SCALEF: return "X86ISD::SCALEF"; + case X86ISD::ADDS: return "X86ISD::ADDS"; + case X86ISD::SUBS: return "X86ISD::SUBS"; + case X86ISD::AVG: return "X86ISD::AVG"; + case X86ISD::MULHRS: return "X86ISD::MULHRS"; + case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; + case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; + case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; + case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; + case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; + case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; + } + return nullptr; +} + +// isLegalAddressingMode - Return true if the addressing mode represented +// by AM is legal for this target, for a load/store of the specified type. +bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + // X86 supports extremely general addressing modes. + CodeModel::Model M = getTargetMachine().getCodeModel(); + Reloc::Model R = getTargetMachine().getRelocationModel(); + + // X86 allows a sign-extended 32-bit immediate field as a displacement. + if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) + return false; + + if (AM.BaseGV) { + unsigned GVFlags = + Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); + + // If a reference to this global requires an extra load, we can't fold it. + if (isGlobalStubReference(GVFlags)) + return false; + + // If BaseGV requires a register for the PIC base, we cannot also have a + // BaseReg specified. + if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) + return false; + + // If lower 4G is not available, then we must use rip-relative addressing. + if ((M != CodeModel::Small || R != Reloc::Static) && + Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) + return false; + } + + switch (AM.Scale) { + case 0: + case 1: + case 2: + case 4: + case 8: + // These scales always work. + break; + case 3: + case 5: + case 9: + // These scales are formed with basereg+scalereg. Only accept if there is + // no basereg yet. + if (AM.HasBaseReg) + return false; + break; + default: // Other stuff never works. + return false; + } + + return true; +} + +bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { + unsigned Bits = Ty->getScalarSizeInBits(); + + // 8-bit shifts are always expensive, but versions with a scalar amount aren't + // particularly cheaper than those without. + if (Bits == 8) + return false; + + // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make + // variable shifts just as cheap as scalar ones. 
+ if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64)) + return false; + + // Otherwise, it's significantly cheaper to shift by a scalar amount than by a + // fully general vector. + return true; +} + +bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + return NumBits1 > NumBits2; +} + +bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + + if (!isTypeLegal(EVT::getEVT(Ty1))) + return false; + + assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); + + // Assuming the caller doesn't have a zeroext or signext return parameter, + // truncation all the way down to i1 is valid. + return true; +} + +bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return isInt<32>(Imm); +} + +bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { + // Can also use sub to handle negated immediates. + return isInt<32>(Imm); +} + +bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + return NumBits1 > NumBits2; +} + +bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { + // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. + return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); +} + +bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { + // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. + return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); +} + +bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + EVT VT1 = Val.getValueType(); + if (isZExtFree(VT1, VT2)) + return true; + + if (Val.getOpcode() != ISD::LOAD) + return false; + + if (!VT1.isSimple() || !VT1.isInteger() || + !VT2.isSimple() || !VT2.isInteger()) + return false; + + switch (VT1.getSimpleVT().SimpleTy) { + default: break; + case MVT::i8: + case MVT::i16: + case MVT::i32: + // X86 has 8, 16, and 32-bit zero-extending loads. + return true; + } + + return false; +} + +bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } + +bool +X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + if (!Subtarget->hasAnyFMA()) + return false; + + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { + // i16 instructions are longer (0x66 prefix) and potentially slower. + return !(VT1 == MVT::i32 && VT2 == MVT::i16); +} + +/// isShuffleMaskLegal - Targets can use this to indicate that they only +/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values +/// are assumed to be legal. +bool +X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, + EVT VT) const { + if (!VT.isSimple()) + return false; + + // Not for i1 vectors + if (VT.getSimpleVT().getScalarType() == MVT::i1) + return false; + + // Very little shuffling can be done for 64-bit vectors right now. 
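+ // (64-bit vectors are the MMX-sized types, for which only a handful of
+ // fixed patterns exist, so general shuffles are rejected outright.)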
+ if (VT.getSimpleVT().getSizeInBits() == 64) + return false; + + // We only care that the types being shuffled are legal. The lowering can + // handle any possible shuffle mask that results. + return isTypeLegal(VT.getSimpleVT()); +} + +bool +X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, + EVT VT) const { + // Just delegate to the generic legality, clear masks aren't special. + return isShuffleMaskLegal(Mask, VT); +} + +//===----------------------------------------------------------------------===// +// X86 Scheduler Hooks +//===----------------------------------------------------------------------===// + +/// Utility function to emit xbegin specifying the start of an RTM region. +static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, + const TargetInstrInfo *TII) { + DebugLoc DL = MI->getDebugLoc(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = ++MBB->getIterator(); + + // For the v = xbegin(), we generate + // + // thisMBB: + // xbegin sinkMBB + // + // mainMBB: + // eax = -1 + // + // sinkMBB: + // v = eax + + MachineBasicBlock *thisMBB = MBB; + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + // xbegin sinkMBB + // # fallthrough to mainMBB + // # abortion to sinkMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(sinkMBB); + + // mainMBB: + // EAX = -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + // EAX is live into the sinkMBB + sinkMBB->addLiveIn(X86::EAX); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); + + MI->eraseFromParent(); + return sinkMBB; +} + +// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 +// or XMM0_V32I8 in AVX all of this code can be replaced with that +// in the .td file. 
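+// EmitPCMPSTRM expands the PCMPxSTRM pseudos: it rebuilds the real
+// (V)PCMPISTRM/(V)PCMPESTRM instruction from the pseudo's explicit operands
+// and then copies the implicit XMM0 result into the pseudo's destination
+// register.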
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; + case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; + case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; + case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; + case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; + case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; + case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; + case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; + } + + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); + + unsigned NumArgs = MI->getNumOperands(); + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!(Op.isReg() && Op.isImplicit())) + MIB.addOperand(Op); + } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + BuildMI(*BB, MI, dl, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::XMM0); + + MI->eraseFromParent(); + return BB; +} + +// FIXME: Custom handling because TableGen doesn't support multiple implicit +// defs in an instruction pattern +static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; + case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; + case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; + case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; + case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; + case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; + case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; + case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; + } + + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); + + unsigned NumArgs = MI->getNumOperands(); // remove the results + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!(Op.isReg() && Op.isImplicit())) + MIB.addOperand(Op); + } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + BuildMI(*BB, MI, dl, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::ECX); + + MI->eraseFromParent(); + return BB; +} + +static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert input VAL into EAX + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI->getOperand(0).getReg()); + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert zero to EDX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX) + .addReg(X86::EDX) + .addReg(X86::EDX); + // insert WRPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); + + MI->eraseFromParent(); // The pseudo is gone now. 
+ return BB; +} + +static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert RDPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + // Address into RAX/EAX, other two args into ECX, EDX. + unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + + unsigned ValOps = X86::AddrNumOperands; + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI->getOperand(ValOps).getReg()); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) + .addReg(MI->getOperand(ValOps+1).getReg()); + + // The instruction doesn't actually take any operands though. + BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const { + // Emit va_arg instruction on X86-64. + + // Operands to this pseudo-instruction: + // 0 ) Output : destination address (reg) + // 1-5) Input : va_list address (addr, i64mem) + // 6 ) ArgSize : Size (in bytes) of vararg type + // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset + // 8 ) Align : Alignment of type + // 9 ) EFLAGS (implicit-def) + + assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); + static_assert(X86::AddrNumOperands == 5, + "VAARG_64 assumes 5 address operands"); + + unsigned DestReg = MI->getOperand(0).getReg(); + MachineOperand &Base = MI->getOperand(1); + MachineOperand &Scale = MI->getOperand(2); + MachineOperand &Index = MI->getOperand(3); + MachineOperand &Disp = MI->getOperand(4); + MachineOperand &Segment = MI->getOperand(5); + unsigned ArgSize = MI->getOperand(6).getImm(); + unsigned ArgMode = MI->getOperand(7).getImm(); + unsigned Align = MI->getOperand(8).getImm(); + + // Memory Reference + assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + // Machine Information + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); + const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); + DebugLoc DL = MI->getDebugLoc(); + + // struct va_list { + // i32 gp_offset + // i32 fp_offset + // i64 overflow_area (address) + // i64 reg_save_area (address) + // } + // sizeof(va_list) = 24 + // alignment(va_list) = 8 + + unsigned TotalNumIntRegs = 6; + unsigned TotalNumXMMRegs = 8; + bool UseGPOffset = (ArgMode == 1); + bool UseFPOffset = 
(ArgMode == 2); + unsigned MaxOffset = TotalNumIntRegs * 8 + + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); + + /* Align ArgSize to a multiple of 8 */ + unsigned ArgSizeA8 = (ArgSize + 7) & ~7; + bool NeedsAlign = (Align > 8); + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *overflowMBB; + MachineBasicBlock *offsetMBB; + MachineBasicBlock *endMBB; + + unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB + unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB + unsigned OffsetReg = 0; + + if (!UseGPOffset && !UseFPOffset) { + // If we only pull from the overflow region, we don't create a branch. + // We don't need to alter control flow. + OffsetDestReg = 0; // unused + OverflowDestReg = DestReg; + + offsetMBB = nullptr; + overflowMBB = thisMBB; + endMBB = thisMBB; + } else { + // First emit code to check if gp_offset (or fp_offset) is below the bound. + // If so, pull the argument from reg_save_area. (branch to offsetMBB) + // If not, pull from overflow_area. (branch to overflowMBB) + // + // thisMBB + // | . + // | . + // offsetMBB overflowMBB + // | . + // | . + // endMBB + + // Registers for the PHI in endMBB + OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); + OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); + + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *MF = MBB->getParent(); + overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); + offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); + endMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = ++MBB->getIterator(); + + // Insert the new basic blocks + MF->insert(MBBIter, offsetMBB); + MF->insert(MBBIter, overflowMBB); + MF->insert(MBBIter, endMBB); + + // Transfer the remainder of MBB and its successor edges to endMBB. + endMBB->splice(endMBB->begin(), thisMBB, + std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); + endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); + + // Make offsetMBB and overflowMBB successors of thisMBB + thisMBB->addSuccessor(offsetMBB); + thisMBB->addSuccessor(overflowMBB); + + // endMBB is a successor of both offsetMBB and overflowMBB + offsetMBB->addSuccessor(endMBB); + overflowMBB->addSuccessor(endMBB); + + // Load the offset value into a register + OffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Check if there is enough room left to pull this argument. + BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) + .addReg(OffsetReg) + .addImm(MaxOffset + 8 - ArgSizeA8); + + // Branch to "overflowMBB" if offset >= max + // Fall through to "offsetMBB" otherwise + BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) + .addMBB(overflowMBB); + } + + // In offsetMBB, emit code to use the reg_save_area. + if (offsetMBB) { + assert(OffsetReg != 0); + + // Read the reg_save_area address. 
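+ // reg_save_area sits at offset 16 in the va_list (gp_offset at 0, fp_offset
+ // at 4, overflow_area at 8), hence the displacement of 16 below.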
+ unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 16) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Zero-extend the offset + unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + .addImm(0) + .addReg(OffsetReg) + .addImm(X86::sub_32bit); + + // Add the offset to the reg_save_area to get the final address. + BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) + .addReg(OffsetReg64) + .addReg(RegSaveReg); + + // Compute the offset for the next argument + unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) + .addReg(OffsetReg) + .addImm(UseFPOffset ? 16 : 8); + + // Store it back into the va_list. + BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .addReg(NextOffsetReg) + .setMemRefs(MMOBegin, MMOEnd); + + // Jump to endMBB + BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) + .addMBB(endMBB); + } + + // + // Emit code to use overflow area + // + + // Load the overflow_area address into a register. + unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // If we need to align it, do so. Otherwise, just copy the address + // to OverflowDestReg. + if (NeedsAlign) { + // Align the overflow address + assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); + unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); + + // aligned_addr = (addr + (align-1)) & ~(align-1) + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) + .addReg(OverflowAddrReg) + .addImm(Align-1); + + BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) + .addReg(TmpReg) + .addImm(~(uint64_t)(Align-1)); + } else { + BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) + .addReg(OverflowAddrReg); + } + + // Compute the next overflow address after this argument. + // (the overflow address should be kept 8-byte aligned) + unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) + .addReg(OverflowDestReg) + .addImm(ArgSizeA8); + + // Store the new overflow address. + BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .addReg(NextAddrReg) + .setMemRefs(MMOBegin, MMOEnd); + + // If we branched, emit the PHI to the front of endMBB. + if (offsetMBB) { + BuildMI(*endMBB, endMBB->begin(), DL, + TII->get(X86::PHI), DestReg) + .addReg(OffsetDestReg).addMBB(offsetMBB) + .addReg(OverflowDestReg).addMBB(overflowMBB); + } + + // Erase the pseudo instruction + MI->eraseFromParent(); + + return endMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const { + // Emit code to save XMM registers to the stack. 
The ABI says that the + // number of registers to save is given in %al, so it's theoretically + // possible to do an indirect jump trick to avoid saving all of them, + // however this code takes a simpler approach and just executes all + // of the stores if %al is non-zero. It's less code, and it's probably + // easier on the hardware branch predictor, and stores aren't all that + // expensive anyway. + + // Create the new basic blocks. One block contains all the XMM stores, + // and one block is the final destination regardless of whether any + // stores were performed. + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *F = MBB->getParent(); + MachineFunction::iterator MBBIter = ++MBB->getIterator(); + MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(MBBIter, XMMSaveMBB); + F->insert(MBBIter, EndMBB); + + // Transfer the remainder of MBB and its successor edges to EndMBB. + EndMBB->splice(EndMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // The original block will now fall through to the XMM save block. + MBB->addSuccessor(XMMSaveMBB); + // The XMMSaveMBB will fall through to the end block. + XMMSaveMBB->addSuccessor(EndMBB); + + // Now add the instructions. + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + unsigned CountReg = MI->getOperand(0).getReg(); + int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); + int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); + + if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) { + // If %al is 0, branch around the XMM save block. + BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); + BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); + MBB->addSuccessor(EndMBB); + } + + // Make sure the last operand is EFLAGS, which gets clobbered by the branch + // that was just emitted, but clearly shouldn't be "saved". + assert((MI->getNumOperands() <= 3 || + !MI->getOperand(MI->getNumOperands() - 1).isReg() || + MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS) + && "Expected last argument to be EFLAGS"); + unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // In the XMM save block, save all the XMM argument registers. + for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { + int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; + MachineMemOperand *MMO = F->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) + .addFrameIndex(RegSaveFrameIndex) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addReg(MI->getOperand(i).getReg()) + .addMemOperand(MMO); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + + return EndMBB; +} + +// The EFLAGS operand of SelectItr might be missing a kill marker +// because there were multiple uses of EFLAGS, and ISel didn't know +// which to mark. Figure out whether SelectItr should have had a +// kill marker, and set it if it should. Returns the correct kill +// marker value. +static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, + MachineBasicBlock* BB, + const TargetRegisterInfo* TRI) { + // Scan forward through BB for a use/def of EFLAGS. 
+ MachineBasicBlock::iterator miI(std::next(SelectItr)); + for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { + const MachineInstr& mi = *miI; + if (mi.readsRegister(X86::EFLAGS)) + return false; + if (mi.definesRegister(X86::EFLAGS)) + break; // Should have kill-flag - update below. + } + + // If we hit the end of the block, check whether EFLAGS is live into a + // successor. + if (miI == BB->end()) { + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(X86::EFLAGS)) + return false; + } + } + + // We found a def, or hit the end of the basic block and EFLAGS wasn't live + // out. SelectMI should have a kill flag on EFLAGS. + SelectItr->addRegisterKilled(X86::EFLAGS, TRI); + return true; +} + +// Return true if it is OK for this CMOV pseudo-opcode to be cascaded +// together with other CMOV pseudo-opcodes into a single basic-block with +// conditional jump around it. +static bool isCMOVPseudo(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V4F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_V16F32: + case X86::CMOV_V8F32: + case X86::CMOV_V8F64: + case X86::CMOV_V8I64: + case X86::CMOV_V8I1: + case X86::CMOV_V16I1: + case X86::CMOV_V32I1: + case X86::CMOV_V64I1: + return true; + + default: + return false; + } +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + + // This code lowers all pseudo-CMOV instructions. Generally it lowers these + // as described above, by inserting a BB, and then making a PHI at the join + // point to select the true and false operands of the CMOV in the PHI. + // + // The code also handles two different cases of multiple CMOV opcodes + // in a row. + // + // Case 1: + // In this case, there are multiple CMOVs in a row, all which are based on + // the same condition setting (or the exact opposite condition setting). + // In this case we can lower all the CMOVs using a single inserted BB, and + // then make a number of PHIs at the join point to model the CMOVs. The only + // trickiness here, is that in a case like: + // + // t2 = CMOV cond1 t1, f1 + // t3 = CMOV cond1 t2, f2 + // + // when rewriting this into PHIs, we have to perform some renaming on the + // temps since you cannot have a PHI operand refer to a PHI result earlier + // in the same block. The "simple" but wrong lowering would be: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t2(BB1), f2(BB2) + // + // but clearly t2 is not defined in BB1, so that is incorrect. 
The proper + // renaming is to note that on the path through BB1, t2 is really just a + // copy of t1, and do that renaming, properly generating: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t1(BB1), f2(BB2) + // + // Case 2, we lower cascaded CMOVs such as + // + // (CMOV (CMOV F, T, cc1), T, cc2) + // + // to two successives branches. For that, we look for another CMOV as the + // following instruction. + // + // Without this, we would add a PHI between the two jumps, which ends up + // creating a few copies all around. For instance, for + // + // (sitofp (zext (fcmp une))) + // + // we would generate: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // movaps %xmm0, %xmm1 + // jne .LBB5_2 + // xorps %xmm1, %xmm1 + // .LBB5_2: + // jp .LBB5_4 + // movaps %xmm1, %xmm0 + // .LBB5_4: + // retq + // + // because this custom-inserter would have generated: + // + // A + // | \ + // | B + // | / + // C + // | \ + // | D + // | / + // E + // + // A: X = ...; Y = ... + // B: empty + // C: Z = PHI [X, A], [Y, B] + // D: empty + // E: PHI [X, C], [Z, D] + // + // If we lower both CMOVs in a single step, we can instead generate: + // + // A + // | \ + // | C + // | /| + // |/ | + // | | + // | D + // | / + // E + // + // A: X = ...; Y = ... + // D: empty + // E: PHI [X, A], [X, C], [Y, D] + // + // Which, in our sitofp/fcmp example, gives us something like: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // jne .LBB5_4 + // jp .LBB5_4 + // xorps %xmm0, %xmm0 + // .LBB5_4: + // retq + // + MachineInstr *CascadedCMOV = nullptr; + MachineInstr *LastCMOV = MI; + X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + MachineBasicBlock::iterator NextMIIt = + std::next(MachineBasicBlock::iterator(MI)); + + // Check for case 1, where there are multiple CMOVs with the same condition + // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the + // number of jumps the most. + + if (isCMOVPseudo(MI)) { + // See if we have a string of CMOVS with the same condition. + while (NextMIIt != BB->end() && + isCMOVPseudo(NextMIIt) && + (NextMIIt->getOperand(3).getImm() == CC || + NextMIIt->getOperand(3).getImm() == OppCC)) { + LastCMOV = &*NextMIIt; + ++NextMIIt; + } + } + + // This checks for case 2, but only do this if we didn't already find + // case 1, as indicated by LastCMOV == MI. + if (LastCMOV == MI && + NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && + NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) { + CascadedCMOV = &*NextMIIt; + } + + MachineBasicBlock *jcc1MBB = nullptr; + + // If we have a cascaded CMOV, we lower it to two successive branches to + // the same block. EFLAGS is used by both, so mark it as live in the second. + if (CascadedCMOV) { + jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, jcc1MBB); + jcc1MBB->addLiveIn(X86::EFLAGS); + } + + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + + MachineInstr *LastEFLAGSUser = CascadedCMOV ? 
CascadedCMOV : LastCMOV; + if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { + copy0MBB->addLiveIn(X86::EFLAGS); + sinkMBB->addLiveIn(X86::EFLAGS); + } + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add the true and fallthrough blocks as its successors. + if (CascadedCMOV) { + // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. + BB->addSuccessor(jcc1MBB); + + // In that case, jcc1MBB will itself fallthrough the copy0MBB, and + // jump to the sinkMBB. + jcc1MBB->addSuccessor(copy0MBB); + jcc1MBB->addSuccessor(sinkMBB); + } else { + BB->addSuccessor(copy0MBB); + } + + // The true block target of the first (or only) branch is always sinkMBB. + BB->addSuccessor(sinkMBB); + + // Create the conditional branch instruction. + unsigned Opc = X86::GetCondBranchFromCond(CC); + BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + + if (CascadedCMOV) { + unsigned Opc2 = X86::GetCondBranchFromCond( + (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); + BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); + } + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + copy0MBB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); + DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; + MachineInstrBuilder MIB; + + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. + + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are generating is the opposite condition from + // the jump we generated, then we have to swap the operands for the + // PHI that is going to be generated. + if (MIIt->getOperand(3).getImm() == OppCC) + std::swap(Op1Reg, Op2Reg); + + if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) + Op1Reg = RegRewriteTable[Op1Reg].first; + + if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) + Op2Reg = RegRewriteTable[Op2Reg].second; + + MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, + TII->get(X86::PHI), DestReg) + .addReg(Op1Reg).addMBB(copy0MBB) + .addReg(Op2Reg).addMBB(thisMBB); + + // Add this PHI to the rewrite table. + RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + // If we have a cascaded CMOV, the second Jcc provides the same incoming + // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). + if (CascadedCMOV) { + MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); + // Copy the PHI result to the register defined by the second CMOV. 
+ BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), + DL, TII->get(TargetOpcode::COPY), + CascadedCMOV->getOperand(0).getReg()) + .addReg(MI->getOperand(0).getReg()); + CascadedCMOV->eraseFromParent(); + } + + // Now remove the CMOV(s). + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) + (MIIt++)->eraseFromParent(); + + return sinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI, + MachineBasicBlock *BB) const { + // Combine the following atomic floating-point modification pattern: + // a.store(reg OP a.load(acquire), release) + // Transform them into: + // OPss (%gpr), %xmm + // movss %xmm, (%gpr) + // Or sd equivalent for 64-bit operations. + unsigned MOp, FOp; + switch (MI->getOpcode()) { + default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); + case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break; + case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break; + } + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + MachineOperand MSrc = MI->getOperand(0); + unsigned VSrc = MI->getOperand(5).getReg(); + const MachineOperand &Disp = MI->getOperand(3); + MachineOperand ZeroDisp = MachineOperand::CreateImm(0); + bool hasDisp = Disp.isGlobal() || Disp.isImm(); + if (hasDisp && MSrc.isReg()) + MSrc.setIsKill(false); + MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp)) + .addOperand(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) + .addReg(0); + MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp), + MRI.createVirtualRegister(MRI.getRegClass(VSrc))) + .addReg(VSrc) + .addOperand(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) + .addReg(/*Segment=*/0); + MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + + assert(MF->shouldSplitStack()); + + const bool Is64Bit = Subtarget->is64Bit(); + const bool IsLP64 = Subtarget->isTarget64BitLP64(); + + const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; + const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; + + // BB: + // ... [Till the alloca] + // If stacklet is not large enough, jump to mallocMBB + // + // bumpMBB: + // Allocate by subtracting from RSP + // Jump to continueMBB + // + // mallocMBB: + // Allocate by call to runtime + // + // continueMBB: + // ... 
+ // [rest of original BB] + // + + MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterClass *AddrRegClass = + getRegClassFor(getPointerTy(MF->getDataLayout())); + + unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), + bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), + tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), + sizeVReg = MI->getOperand(1).getReg(), + physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; + + MachineFunction::iterator MBBIter = ++BB->getIterator(); + + MF->insert(MBBIter, bumpMBB); + MF->insert(MBBIter, mallocMBB); + MF->insert(MBBIter, continueMBB); + + continueMBB->splice(continueMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + continueMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add code to the main basic block to check if the stack limit has been hit, + // and if so, jump to mallocMBB otherwise to bumpMBB. + BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); + BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) + .addReg(tmpSPVReg).addReg(sizeVReg); + BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) + .addReg(SPLimitVReg); + BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); + + // bumpMBB simply decreases the stack pointer, since we know the current + // stacklet has enough space. + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) + .addReg(SPLimitVReg); + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) + .addReg(SPLimitVReg); + BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); + + // Calls into a routine in libgcc to allocate more space from the heap. + const uint32_t *RegMask = + Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); + if (IsLP64) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::RDI, RegState::Implicit) + .addReg(X86::RAX, RegState::ImplicitDefine); + } else if (Is64Bit) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EDI, RegState::Implicit) + .addReg(X86::EAX, RegState::ImplicitDefine); + } else { + BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) + .addImm(12); + BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EAX, RegState::ImplicitDefine); + } + + if (!Is64Bit) + BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) + .addImm(16); + + BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) + .addReg(IsLP64 ? X86::RAX : X86::EAX); + BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); + + // Set up the CFG correctly. 
+ BB->addSuccessor(bumpMBB); + BB->addSuccessor(mallocMBB); + mallocMBB->addSuccessor(continueMBB); + bumpMBB->addSuccessor(continueMBB); + + // Take care of the PHI nodes. + BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), + MI->getOperand(0).getReg()) + .addReg(mallocPtrVReg).addMBB(mallocMBB) + .addReg(bumpSPPtrVReg).addMBB(bumpMBB); + + // Delete the original pseudo instruction. + MI->eraseFromParent(); + + // And we're done. + return continueMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { + assert(!Subtarget->isTargetMachO()); + DebugLoc DL = MI->getDebugLoc(); + MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe( + *BB->getParent(), *BB, MI, DL, false); + MachineBasicBlock *ResumeBB = ResumeMI->getParent(); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return ResumeBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB(); + DebugLoc DL = MI->getDebugLoc(); + + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && + "SEH does not use catchret!"); + + // Only 32-bit EH needs to worry about manually restoring stack pointers. + if (!Subtarget->is32Bit()) + return BB; + + // C++ EH creates a new target block to hold the restore code, and wires up + // the new block to the return destination with a normal JMP_4. + MachineBasicBlock *RestoreMBB = + MF->CreateMachineBasicBlock(BB->getBasicBlock()); + assert(BB->succ_size() == 1); + MF->insert(std::next(BB->getIterator()), RestoreMBB); + RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(RestoreMBB); + MI->getOperand(0).setMBB(RestoreMBB); + + auto RestoreMBBI = RestoreMBB->begin(); + BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); + BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const Constant *PerFn = MF->getFunction()->getPersonalityFn(); + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); + // Only 32-bit SEH requires special handling for catchpad. + if (IsSEH && Subtarget->is32Bit()) { + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); + } + MI->eraseFromParent(); + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, + MachineBasicBlock *BB) const { + // This is pretty easy. We're taking the value that we received from + // our load from the relocation, sticking it in either RDI (x86-64) + // or EAX and doing an indirect call. The return value will then + // be in the normal return register. + MachineFunction *F = BB->getParent(); + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); + assert(MI->getOperand(3).isGlobal() && "This should be a global"); + + // Get a register mask for the lowered call. + // FIXME: The 32-bit calls have non-standard calling conventions. Use a + // proper register mask. 
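As a point of reference (an assumption about typical Darwin codegen, not something this patch states), the 64-bit sequence built below corresponds to a source-level thread-local access along these lines; the variable and function names are hypothetical:

// Hypothetical source that reaches this lowering on Darwin x86-64:
thread_local int tls_counter;                 // illustrative variable only
int *addressOfTLS() { return &tls_counter; }  // lowered to the pattern sketched below

// Expected shape of the emitted code (sketch):
//   movq  _tls_counter@TLVP(%rip), %rdi   # load the TLV descriptor address
//   callq *(%rdi)                         # indirect call through the descriptor
//   # address of the thread-local is returned in %rax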
+ const uint32_t *RegMask = + Subtarget->is64Bit() ? + Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() : + Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); + if (Subtarget->is64Bit()) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV64rm), X86::RDI) + .addReg(X86::RIP) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); + addDirectMem(MIB, X86::RDI); + MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); + } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(0) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); + } else { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(TII->getGlobalBaseReg(F)) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = ++MBB->getIterator(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg; + unsigned MemOpndSlot = 0; + + unsigned CurOp = 0; + + DstReg = MI->getOperand(CurOp++).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + assert(RC->hasType(MVT::i32) && "Invalid destination!"); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned restoreDstReg = MRI.createVirtualRegister(RC); + + MemOpndSlot = CurOp; + + MVT PVT = getPointerTy(MF->getDataLayout()); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + // For v = setjmp(buf), we generate + // + // thisMBB: + // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB + // SjLjSetup restoreMBB + // + // mainMBB: + // v_main = 0 + // + // sinkMBB: + // v = phi(main, restore) + // + // restoreMBB: + // if base pointer being used, load it from frame + // v_restore = 1 + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + MF->push_back(restoreMBB); + restoreMBB->setHasAddressTaken(); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. 
+ sinkMBB->splice(sinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + unsigned PtrStoreOpc = 0; + unsigned LabelReg = 0; + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + Reloc::Model RM = MF->getTarget().getRelocationModel(); + bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && + (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); + + // Prepare IP either in reg or imm. + if (!UseImmLabel) { + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; + const TargetRegisterClass *PtrRC = getRegClassFor(PVT); + LabelReg = MRI.createVirtualRegister(PtrRC); + if (Subtarget->is64Bit()) { + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB) + .addReg(0); + } else { + const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) + .addReg(XII->getGlobalBaseReg(MF)) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) + .addReg(0); + } + } else + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; + // Store IP + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); + else + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } + if (!UseImmLabel) + MIB.addReg(LabelReg); + else + MIB.addMBB(restoreMBB); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Setup + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) + .addMBB(restoreMBB); + + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + MIB.addRegMask(RegInfo->getNoPreservedMask()); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(restoreMBB); + + // mainMBB: + // EAX = 0 + BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(X86::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(restoreDstReg).addMBB(restoreMBB); + + // restoreMBB: + if (RegInfo->hasBasePointer(*MF)) { + const bool Uses64BitFramePtr = + Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64(); + X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); + X86FI->setRestoreBasePointer(MF); + unsigned FramePtr = RegInfo->getFrameRegister(*MF); + unsigned BasePtr = RegInfo->getBaseRegister(); + unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64rm : X86::MOV32rm; + addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .setMIFlag(MachineInstr::FrameSetup); + } + BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); + BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); + restoreMBB->addSuccessor(sinkMBB); + + MI->eraseFromParent(); + return sinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + MVT PVT = getPointerTy(MF->getDataLayout()); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + const TargetRegisterClass *RC = + (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; + unsigned Tmp = MRI.createVirtualRegister(RC); + // Since FP is only updated here but NOT referenced, it's treated as GPR. + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; + unsigned SP = RegInfo->getStackRegister(); + + MachineInstrBuilder MIB; + + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); + + unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; + unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; + + // Reload FP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload IP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), LabelOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload SP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), SPOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Jump + BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); + + MI->eraseFromParent(); + return MBB; +} + +// Replace 213-type (isel default) FMA3 instructions with 231-type for +// accumulator loops. Writing back to the accumulator allows the coalescer +// to remove extra copies in the loop. +// FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937). +MachineBasicBlock * +X86TargetLowering::emitFMA3Instr(MachineInstr *MI, + MachineBasicBlock *MBB) const { + MachineOperand &AddendOp = MI->getOperand(3); + + // Bail out early if the addend isn't a register - we can't switch these. + if (!AddendOp.isReg()) + return MBB; + + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Check whether the addend is defined by a PHI: + assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); + MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); + if (!AddendDef.isPHI()) + return MBB; + + // Look for the following pattern: + // loop: + // %addend = phi [%entry, 0], [%loop, %result] + // ... 
+ // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend + + // Replace with: + // loop: + // %addend = phi [%entry, 0], [%loop, %result] + // ... + // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2 + + for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { + assert(AddendDef.getOperand(i).isReg()); + MachineOperand PHISrcOp = AddendDef.getOperand(i); + MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); + if (&PHISrcInst == MI) { + // Found a matching instruction. + unsigned NewFMAOpc = 0; + switch (MI->getOpcode()) { + case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break; + case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break; + case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break; + case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break; + case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break; + case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break; + case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break; + case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break; + case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break; + case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break; + case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break; + case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break; + case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break; + case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; + case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; + case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; + case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; + case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; + case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; + case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; + + case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; + case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; + case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; + case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break; + case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break; + case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; + case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; + case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; + case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; + case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; + case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; + case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; + default: llvm_unreachable("Unrecognized FMA variant."); + } + + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = + BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(1)); + MBB->insert(MachineBasicBlock::iterator(MI), MIB); + MI->eraseFromParent(); + } + } + + return MBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + switch (MI->getOpcode()) { + default: llvm_unreachable("Unexpected instr type to insert"); + case X86::TAILJMPd64: + case X86::TAILJMPr64: + case X86::TAILJMPm64: + case X86::TAILJMPd64_REX: + case X86::TAILJMPr64_REX: + 
case X86::TAILJMPm64_REX: + llvm_unreachable("TAILJMP64 would not be touched here."); + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + return BB; + case X86::WIN_ALLOCA: + return EmitLoweredWinAlloca(MI, BB); + case X86::CATCHRET: + return EmitLoweredCatchRet(MI, BB); + case X86::CATCHPAD: + return EmitLoweredCatchPad(MI, BB); + case X86::SEG_ALLOCA_32: + case X86::SEG_ALLOCA_64: + return EmitLoweredSegAlloca(MI, BB); + case X86::TLSCall_32: + case X86::TLSCall_64: + return EmitLoweredTLSCall(MI, BB); + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_FR128: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V4F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_V16F32: + case X86::CMOV_V8F32: + case X86::CMOV_V8F64: + case X86::CMOV_V8I64: + case X86::CMOV_V8I1: + case X86::CMOV_V16I1: + case X86::CMOV_V32I1: + case X86::CMOV_V64I1: + return EmitLoweredSelect(MI, BB); + + case X86::RDFLAGS32: + case X86::RDFLAGS64: { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned PushF = + MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; + unsigned Pop = + MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; + BuildMI(*BB, MI, DL, TII->get(PushF)); + BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg()); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; + } + + case X86::WRFLAGS32: + case X86::WRFLAGS64: { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned Push = + MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; + unsigned PopF = + MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; + BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg()); + BuildMI(*BB, MI, DL, TII->get(PopF)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; + } + + case X86::RELEASE_FADD32mr: + case X86::RELEASE_FADD64mr: + return EmitLoweredAtomicFP(MI, BB); + + case X86::FP32_TO_INT16_IN_MEM: + case X86::FP32_TO_INT32_IN_MEM: + case X86::FP32_TO_INT64_IN_MEM: + case X86::FP64_TO_INT16_IN_MEM: + case X86::FP64_TO_INT32_IN_MEM: + case X86::FP64_TO_INT64_IN_MEM: + case X86::FP80_TO_INT16_IN_MEM: + case X86::FP80_TO_INT32_IN_MEM: + case X86::FP80_TO_INT64_IN_MEM: { + MachineFunction *F = BB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + // Change the floating point control register to use "round towards zero" + // mode when truncating to an integer value. + int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FNSTCW16m)), CWFrameIdx); + + // Load the old value of the high byte of the control word... + unsigned OldCW = + F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), + CWFrameIdx); + + // Set the high part to be round to zero... + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) + .addImm(0xC7F); + + // Reload the modified control word now... 
+ addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); + + // Restore the memory image of control word to original value + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) + .addReg(OldCW); + + // Get the X86 opcode to use. + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; + case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; + case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; + case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; + case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; + case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; + case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; + case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; + case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; + } + + X86AddressMode AM; + MachineOperand &Op = MI->getOperand(0); + if (Op.isReg()) { + AM.BaseType = X86AddressMode::RegBase; + AM.Base.Reg = Op.getReg(); + } else { + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = Op.getIndex(); + } + Op = MI->getOperand(1); + if (Op.isImm()) + AM.Scale = Op.getImm(); + Op = MI->getOperand(2); + if (Op.isImm()) + AM.IndexReg = Op.getImm(); + Op = MI->getOperand(3); + if (Op.isGlobal()) { + AM.GV = Op.getGlobal(); + } else { + AM.Disp = Op.getImm(); + } + addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) + .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); + + // Reload the original control word now. + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + // String/text processing lowering. + case X86::PCMPISTRM128REG: + case X86::VPCMPISTRM128REG: + case X86::PCMPISTRM128MEM: + case X86::VPCMPISTRM128MEM: + case X86::PCMPESTRM128REG: + case X86::VPCMPESTRM128REG: + case X86::PCMPESTRM128MEM: + case X86::VPCMPESTRM128MEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo()); + + // String/text processing lowering. + case X86::PCMPISTRIREG: + case X86::VPCMPISTRIREG: + case X86::PCMPISTRIMEM: + case X86::VPCMPISTRIMEM: + case X86::PCMPESTRIREG: + case X86::VPCMPESTRIREG: + case X86::PCMPESTRIMEM: + case X86::VPCMPESTRIMEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo()); + + // Thread synchronization. + case X86::MONITOR: + return EmitMonitor(MI, BB, Subtarget); + // PKU feature + case X86::WRPKRU: + return EmitWRPKRU(MI, BB, Subtarget); + case X86::RDPKRU: + return EmitRDPKRU(MI, BB, Subtarget); + // xbegin + case X86::XBEGIN: + return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); + + case X86::VASTART_SAVE_XMM_REGS: + return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + + case X86::VAARG_64: + return EmitVAARG64WithCustomInserter(MI, BB); + + case X86::EH_SjLj_SetJmp32: + case X86::EH_SjLj_SetJmp64: + return emitEHSjLjSetJmp(MI, BB); + + case X86::EH_SjLj_LongJmp32: + case X86::EH_SjLj_LongJmp64: + return emitEHSjLjLongJmp(MI, BB); + + case TargetOpcode::STATEPOINT: + // As an implementation detail, STATEPOINT shares the STACKMAP format at + // this point in the process. We diverge later. 
+ return emitPatchPoint(MI, BB); + + case TargetOpcode::STACKMAP: + case TargetOpcode::PATCHPOINT: + return emitPatchPoint(MI, BB); + + case X86::VFMADDPDr213r: + case X86::VFMADDPSr213r: + case X86::VFMADDSDr213r: + case X86::VFMADDSSr213r: + case X86::VFMSUBPDr213r: + case X86::VFMSUBPSr213r: + case X86::VFMSUBSDr213r: + case X86::VFMSUBSSr213r: + case X86::VFNMADDPDr213r: + case X86::VFNMADDPSr213r: + case X86::VFNMADDSDr213r: + case X86::VFNMADDSSr213r: + case X86::VFNMSUBPDr213r: + case X86::VFNMSUBPSr213r: + case X86::VFNMSUBSDr213r: + case X86::VFNMSUBSSr213r: + case X86::VFMADDSUBPDr213r: + case X86::VFMADDSUBPSr213r: + case X86::VFMSUBADDPDr213r: + case X86::VFMSUBADDPSr213r: + case X86::VFMADDPDr213rY: + case X86::VFMADDPSr213rY: + case X86::VFMSUBPDr213rY: + case X86::VFMSUBPSr213rY: + case X86::VFNMADDPDr213rY: + case X86::VFNMADDPSr213rY: + case X86::VFNMSUBPDr213rY: + case X86::VFNMSUBPSr213rY: + case X86::VFMADDSUBPDr213rY: + case X86::VFMADDSUBPSr213rY: + case X86::VFMSUBADDPDr213rY: + case X86::VFMSUBADDPSr213rY: + return emitFMA3Instr(MI, BB); + } +} + +//===----------------------------------------------------------------------===// +// X86 Optimization Hooks +//===----------------------------------------------------------------------===// + +void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + unsigned BitWidth = KnownZero.getBitWidth(); + unsigned Opc = Op.getOpcode(); + assert((Opc >= ISD::BUILTIN_OP_END || + Opc == ISD::INTRINSIC_WO_CHAIN || + Opc == ISD::INTRINSIC_W_CHAIN || + Opc == ISD::INTRINSIC_VOID) && + "Should use MaskedValueIsZero if you don't know whether Op" + " is a target node!"); + + KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. + switch (Opc) { + default: break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::ADC: + case X86ISD::SBB: + case X86ISD::SMUL: + case X86ISD::UMUL: + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + // These nodes' second result is a boolean. + if (Op.getResNo() == 0) + break; + // Fallthrough + case X86ISD::SETCC: + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + break; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned NumLoBits = 0; + switch (IntId) { + default: break; + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx2_pmovmskb: { + // High bits of movmskp{s|d}, pmovmskb are known zero. + switch (IntId) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; + case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; + case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; + case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; + case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; + case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; + case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; + } + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); + break; + } + } + break; + } + } +} + +unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &, + unsigned Depth) const { + // SETCC_CARRY sets the dest to ~0 for true or 0 for false. + if (Op.getOpcode() == X86ISD::SETCC_CARRY) + return Op.getValueType().getScalarSizeInBits(); + + // Fallback case. + return 1; +} + +/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the +/// node is a GlobalAddress + offset. +bool X86TargetLowering::isGAPlusOffset(SDNode *N, + const GlobalValue* &GA, + int64_t &Offset) const { + if (N->getOpcode() == X86ISD::Wrapper) { + if (isa<GlobalAddressSDNode>(N->getOperand(0))) { + GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); + Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); + return true; + } + } + return TargetLowering::isGAPlusOffset(N, GA, Offset); +} + +/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. +/// FIXME: This could be expanded to support 512 bit vectors as well. +static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget* Subtarget) { + SDLoc dl(N); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + MVT VT = SVOp->getSimpleValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + if (V1.getOpcode() == ISD::CONCAT_VECTORS && + V2.getOpcode() == ISD::CONCAT_VECTORS) { + // + // 0,0,0,... + // | + // V UNDEF BUILD_VECTOR UNDEF + // \ / \ / + // CONCAT_VECTOR CONCAT_VECTOR + // \ / + // \ / + // RESULT: V + zero extended + // + if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || + V2.getOperand(1).getOpcode() != ISD::UNDEF || + V1.getOperand(1).getOpcode() != ISD::UNDEF) + return SDValue(); + + if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) + return SDValue(); + + // To match the shuffle mask, the first half of the mask should + // be exactly the first vector, and all the rest a splat with the + // first element of the second one. + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || + !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) + return SDValue(); + + // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { + if (Ld->hasNUsesOfValue(1, 0)) { + SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); + SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + Ld->getMemoryVT(), + Ld->getPointerInfo(), + Ld->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as Ld in + // terms of dependency. We create a TokenFactor for Ld and ResNode, + // and update uses of Ld's output chain to use the TokenFactor. 
+ if (Ld->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), + SDValue(ResNode.getNode(), 1)); + } + + return DAG.getBitcast(VT, ResNode); + } + } + + // Emit a zeroed vector and insert the desired subvector on its + // first half. + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); + return DCI.CombineTo(N, InsV); + } + + return SDValue(); +} + +/// \brief Combine an arbitrary chain of shuffles into a single instruction if +/// possible. +/// +/// This is the leaf of the recursive combinine below. When we have found some +/// chain of single-use x86 shuffle instructions and accumulated the combined +/// shuffle mask represented by them, this will try to pattern match that mask +/// into either a single instruction if there is a special purpose instruction +/// for this operation, or into a PSHUFB instruction which is a fully general +/// instruction but should only be used to replace chains over a certain depth. +static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, + int Depth, bool HasPSHUFB, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); + + // Find the operand that enters the chain. Note that multiple uses are OK + // here, we're not going to remove the operand we find. + SDValue Input = Op.getOperand(0); + while (Input.getOpcode() == ISD::BITCAST) + Input = Input.getOperand(0); + + MVT VT = Input.getSimpleValueType(); + MVT RootVT = Root.getSimpleValueType(); + SDLoc DL(Root); + + if (Mask.size() == 1) { + int Index = Mask[0]; + assert((Index >= 0 || Index == SM_SentinelUndef || + Index == SM_SentinelZero) && + "Invalid shuffle index found!"); + + // We may end up with an accumulated mask of size 1 as a result of + // widening of shuffle operands (see function canWidenShuffleElements). + // If the only shuffle index is equal to SM_SentinelZero then propagate + // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle + // mask, and therefore the entire chain of shuffles can be folded away. + if (Index == SM_SentinelZero) + DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL)); + else + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), + /*AddTo*/ true); + return true; + } + + // Use the float domain if the operand type is a floating point type. + bool FloatDomain = VT.isFloatingPoint(); + + // For floating point shuffles, we don't have free copies in the shuffle + // instructions or the ability to load as part of the instruction, so + // canonicalize their shuffles to UNPCK or MOV variants. + // + // Note that even with AVX we prefer the PSHUFD form of shuffle for integer + // vectors because it can have a load folded into it that UNPCK cannot. This + // doesn't preclude something switching to the shorter encoding post-RA. + // + // FIXME: Should teach these routines about AVX vector widths. + if (FloatDomain && VT.is128BitVector()) { + if (Mask.equals({0, 0}) || Mask.equals({1, 1})) { + bool Lo = Mask.equals({0, 0}); + unsigned Shuffle; + MVT ShuffleVT; + // Check if we have SSE3 which will let us use MOVDDUP. 
That instruction + // is no slower than UNPCKLPD but has the option to fold the input operand + // into even an unaligned memory load. + if (Lo && Subtarget->hasSSE3()) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v2f64; + } else { + // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller + // than the UNPCK variants. + Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; + ShuffleVT = MVT::v4f32; + } + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + if (Shuffle == X86ISD::MOVDDUP) + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + else + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Subtarget->hasSSE3() && + (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) { + bool Lo = Mask.equals({0, 0, 2, 2}); + unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) { + bool Lo = Mask.equals({0, 0, 1, 1}); + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + } + + // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK + // variants as none of these have single-instruction variants that are + // superior to the UNPCK formulation. + if (!FloatDomain && VT.is128BitVector() && + (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || + Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals( + {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) { + bool Lo = Mask[0] == 0; + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + MVT ShuffleVT; + switch (Mask.size()) { + case 8: + ShuffleVT = MVT::v8i16; + break; + case 16: + ShuffleVT = MVT::v16i8; + break; + default: + llvm_unreachable("Impossible mask size!"); + }; + Op = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Don't try to re-form single instruction chains under any circumstances now + // that we've done encoding canonicalization for them. + if (Depth < 2) + return false; + + // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we + // can replace them with a single PSHUFB instruction profitably. Intel's + // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but + // in practice PSHUFB tends to be *very* fast so we're more aggressive. 
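Since the byte-mask construction in the block that follows is fairly dense, here is a standalone sketch of the same widening arithmetic, for illustration only; it folds the undef and zero cases together (the real code keeps them distinct, using 255, which also has bit 7 set):

// How a per-element shuffle mask is widened to a per-byte PSHUFB control
// vector. A control byte with bit 7 set makes PSHUFB zero that output byte.
#include <cstdint>
#include <vector>

static std::vector<uint8_t> widenToPshufbMask(const std::vector<int> &ElemMask,
                                              int VectorBytes) {
  const int Ratio = VectorBytes / (int)ElemMask.size(); // bytes per mask element
  std::vector<uint8_t> ByteMask;
  for (int i = 0; i < VectorBytes; ++i) {
    int M = ElemMask[i / Ratio];
    if (M < 0) {                  // undef or zero lane (simplified here)
      ByteMask.push_back(0x80);   // bit 7 set -> destination byte becomes zero
      continue;
    }
    ByteMask.push_back(uint8_t(Ratio * M + i % Ratio)); // byte within the source element
  }
  return ByteMask;
}
// e.g. a v4i32 mask {2,2,3,3} on a 16-byte vector widens to
// {8,9,10,11, 8,9,10,11, 12,13,14,15, 12,13,14,15}.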
+ if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { + SmallVector<SDValue, 16> PSHUFBMask; + int NumBytes = VT.getSizeInBits() / 8; + int Ratio = NumBytes / Mask.size(); + for (int i = 0; i < NumBytes; ++i) { + if (Mask[i / Ratio] == SM_SentinelUndef) { + PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); + continue; + } + int M = Mask[i / Ratio] != SM_SentinelZero + ? Ratio * Mask[i / Ratio] + i % Ratio + : 255; + PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); + } + MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); + Op = DAG.getBitcast(ByteVT, Input); + DCI.AddToWorklist(Op.getNode()); + SDValue PSHUFBMaskOp = + DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask); + DCI.AddToWorklist(PSHUFBMaskOp.getNode()); + Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Failed to find any combines. + return false; +} + +/// \brief Fully generic combining of x86 shuffle instructions. +/// +/// This should be the last combine run over the x86 shuffle instructions. Once +/// they have been fully optimized, this will recursively consider all chains +/// of single-use shuffle instructions, build a generic model of the cumulative +/// shuffle operation, and check for simpler instructions which implement this +/// operation. We use this primarily for two purposes: +/// +/// 1) Collapse generic shuffles to specialized single instructions when +/// equivalent. In most cases, this is just an encoding size win, but +/// sometimes we will collapse multiple generic shuffles into a single +/// special-purpose shuffle. +/// 2) Look for sequences of shuffle instructions with 3 or more total +/// instructions, and replace them with the slightly more expensive SSSE3 +/// PSHUFB instruction if available. We do this as the last combining step +/// to ensure we avoid using PSHUFB if we can implement the shuffle with +/// a suitable short sequence of other instructions. The PHUFB will either +/// use a register or have to read from memory and so is slightly (but only +/// slightly) more expensive than the other shuffle instructions. +/// +/// Because this is inherently a quadratic operation (for each shuffle in +/// a chain, we recurse up the chain), the depth is limited to 8 instructions. +/// This should never be an issue in practice as the shuffle lowering doesn't +/// produce sequences of more than 8 instructions. +/// +/// FIXME: We will currently miss some cases where the redundant shuffling +/// would simplify under the threshold for PSHUFB formation because of +/// combine-ordering. To fix this, we should do the redundant instruction +/// combining in this recursive walk. +static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, + ArrayRef<int> RootMask, + int Depth, bool HasPSHUFB, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // Bound the depth of our recursive combine because this is ultimately + // quadratic in nature. + if (Depth > 8) + return false; + + // Directly rip through bitcasts to find the underlying operand. + while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) + Op = Op.getOperand(0); + + MVT VT = Op.getSimpleValueType(); + if (!VT.isVector()) + return false; // Bail if we hit a non-vector. 
+ + assert(Root.getSimpleValueType().isVector() && + "Shuffles operate on vector types!"); + assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && + "Can only combine shuffles of the same vector register size."); + + if (!isTargetShuffle(Op.getOpcode())) + return false; + SmallVector<int, 16> OpMask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); + // We only can combine unary shuffles which we can decode the mask for. + if (!HaveMask || !IsUnary) + return false; + + assert(VT.getVectorNumElements() == OpMask.size() && + "Different mask size from vector size!"); + assert(((RootMask.size() > OpMask.size() && + RootMask.size() % OpMask.size() == 0) || + (OpMask.size() > RootMask.size() && + OpMask.size() % RootMask.size() == 0) || + OpMask.size() == RootMask.size()) && + "The smaller number of elements must divide the larger."); + int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); + int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); + assert(((RootRatio == 1 && OpRatio == 1) || + (RootRatio == 1) != (OpRatio == 1)) && + "Must not have a ratio for both incoming and op masks!"); + + SmallVector<int, 16> Mask; + Mask.reserve(std::max(OpMask.size(), RootMask.size())); + + // Merge this shuffle operation's mask into our accumulated mask. Note that + // this shuffle's mask will be the first applied to the input, followed by the + // root mask to get us all the way to the root value arrangement. The reason + // for this order is that we are recursing up the operation chain. + for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { + int RootIdx = i / RootRatio; + if (RootMask[RootIdx] < 0) { + // This is a zero or undef lane, we're done. + Mask.push_back(RootMask[RootIdx]); + continue; + } + + int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; + int OpIdx = RootMaskedIdx / OpRatio; + if (OpMask[OpIdx] < 0) { + // The incoming lanes are zero or undef, it doesn't matter which ones we + // are using. + Mask.push_back(OpMask[OpIdx]); + continue; + } + + // Ok, we have non-zero lanes, map them through. + Mask.push_back(OpMask[OpIdx] * OpRatio + + RootMaskedIdx % OpRatio); + } + + // See if we can recurse into the operand to combine more things. + switch (Op.getOpcode()) { + case X86ISD::PSHUFB: + HasPSHUFB = true; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + if (Op.getOperand(0).hasOneUse() && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + assert(Op.getOperand(0) == Op.getOperand(1) && + "We only combine unary shuffles!"); + // We can't check for single use, we have to check that this shuffle is the + // only user. + if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + } + + // Minor canonicalization of the accumulated shuffle mask to make it easier + // to match below. All this does is detect masks with squential pairs of + // elements, and shrink them to the half-width mask. It does this in a loop + // so it will reduce the size of the mask to the minimal width mask which + // performs an equivalent shuffle. 
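// Illustrative sketch (not part of this patch): the mask composition done by
// the merge loop above, as a standalone helper. The operand mask is applied to
// the input first and the root mask last, so each output lane is mapped
// through the root mask and then through the operand mask, with the two ratios
// bridging masks of different element widths; negative entries model the
// zero/undef sentinels.
#include <algorithm>
#include <vector>

static std::vector<int> composeMasks(const std::vector<int> &RootMask,
                                     const std::vector<int> &OpMask) {
  int RootRatio = std::max<int>(1, int(OpMask.size() / RootMask.size()));
  int OpRatio = std::max<int>(1, int(RootMask.size() / OpMask.size()));
  int NumElts = int(std::max(OpMask.size(), RootMask.size()));
  std::vector<int> Mask;
  for (int i = 0; i != NumElts; ++i) {
    int RootIdx = i / RootRatio;
    if (RootMask[RootIdx] < 0) {
      Mask.push_back(RootMask[RootIdx]); // zero/undef propagates unchanged
      continue;
    }
    int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
    int OpIdx = RootMaskedIdx / OpRatio;
    if (OpMask[OpIdx] < 0) {
      Mask.push_back(OpMask[OpIdx]);     // inner lane is zero/undef
      continue;
    }
    Mask.push_back(OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio);
  }
  return Mask;
}
// e.g. a v4i32 root mask {2,3,0,1} over a v8i16 operand mask {1,0,3,2,5,4,7,6}
// composes to the v8i16 mask {5,4,7,6,1,0,3,2}. The widening step that follows
// (canWidenShuffleElements) would leave that result alone, but would shrink a
// mask made of sequential pairs, e.g. {2,3,0,1}, down to {1,0}.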
+ SmallVector<int, 16> WidenedMask; + while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { + Mask = std::move(WidenedMask); + WidenedMask.clear(); + } + + return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, + Subtarget); +} + +/// \brief Get the PSHUF-style mask from PSHUF node. +/// +/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 +/// PSHUF-style masks that can be reused with such instructions. +static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { + MVT VT = N.getSimpleValueType(); + SmallVector<int, 4> Mask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); + (void)HaveMask; + assert(HaveMask); + + // If we have more than 128-bits, only the low 128-bits of shuffle mask + // matter. Check that the upper masks are repeats and remove them. + if (VT.getSizeInBits() > 128) { + int LaneElts = 128 / VT.getScalarSizeInBits(); +#ifndef NDEBUG + for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) + for (int j = 0; j < LaneElts; ++j) + assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && + "Mask doesn't repeat in high 128-bit lanes!"); +#endif + Mask.resize(LaneElts); + } + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + return Mask; + case X86ISD::PSHUFLW: + Mask.resize(4); + return Mask; + case X86ISD::PSHUFHW: + Mask.erase(Mask.begin(), Mask.begin() + 4); + for (int &M : Mask) + M -= 4; + return Mask; + default: + llvm_unreachable("No valid shuffle instruction found!"); + } +} + +/// \brief Search for a combinable shuffle across a chain ending in pshufd. +/// +/// We walk up the chain and look for a combinable shuffle, skipping over +/// shuffles that we could hoist this shuffle's transformation past without +/// altering anything. +static SDValue +combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N.getOpcode() == X86ISD::PSHUFD && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + + // Walk up a single-use chain looking for a combinable shuffle. Keep a stack + // of the shuffles in the chain so that we can form a fresh chain to replace + // this one. + SmallVector<SDValue, 8> Chain; + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return SDValue(); // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFD: + // Found another dword shuffle. + break; + + case X86ISD::PSHUFLW: + // Check that the low words (being shuffled) are the identity in the + // dword shuffle, and the high words are self-contained. + if (Mask[0] != 0 || Mask[1] != 1 || + !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) + return SDValue(); + + Chain.push_back(V); + continue; + + case X86ISD::PSHUFHW: + // Check that the high words (being shuffled) are the identity in the + // dword shuffle, and the low words are self-contained. + if (Mask[2] != 2 || Mask[3] != 3 || + !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) + return SDValue(); + + Chain.push_back(V); + continue; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword + // shuffle into a preceding word shuffle. 
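// Worked example (editorial aside, not from the patch) for the unpack case
// handled next: punpcklwd %xmm0, %xmm0 produces the word pattern
// {w0,w0, w1,w1, w2,w2, w3,w3}, so dword i of its result is the pair
// (w_i, w_i). A PSHUFD applied afterwards merely permutes those duplicated
// pairs, which is the same as first permuting words 0..3 with a PSHUFLW and
// then unpacking; hence the search below for a PSHUFLW (or, for UNPCKH, a
// PSHUFHW) into which the dword shuffle can be folded.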
+ if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && + V.getSimpleValueType().getVectorElementType() != MVT::i16) + return SDValue(); + + // Search for a half-shuffle which we can combine with. + unsigned CombineOp = + V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; + if (V.getOperand(0) != V.getOperand(1) || + !V->isOnlyUserOf(V.getOperand(0).getNode())) + return SDValue(); + Chain.push_back(V); + V = V.getOperand(0); + do { + switch (V.getOpcode()) { + default: + return SDValue(); // Nothing to combine. + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOp) + break; + + Chain.push_back(V); + + // Fallthrough! + case ISD::BITCAST: + V = V.getOperand(0); + continue; + } + break; + } while (V.hasOneUse()); + break; + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return SDValue(); + + // Merge this node's mask and our incoming mask. + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + + // Rebuild the chain around this new shuffle. + while (!Chain.empty()) { + SDValue W = Chain.pop_back_val(); + + if (V.getValueType() != W.getOperand(0).getValueType()) + V = DAG.getBitcast(W.getOperand(0).getValueType(), V); + + switch (W.getOpcode()) { + default: + llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); + break; + + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); + break; + } + } + if (V.getValueType() != N.getValueType()) + V = DAG.getBitcast(N.getValueType(), V); + + // Return the new chain to replace N. + return V; +} + +/// \brief Search for a combinable shuffle across a chain ending in pshuflw or +/// pshufhw. +/// +/// We walk up the chain, skipping shuffles of the other half and looking +/// through shuffles which switch halves trying to find a shuffle of the same +/// pair of dwords. +static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert( + (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + unsigned CombineOpcode = N.getOpcode(); + + // Walk up a single-use chain looking for a combinable shuffle. + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return false; // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOpcode) + break; + + // Other-half shuffles are no-ops. + continue; + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return false; + + // Combine away the bottom node as its shuffle will be accumulated into + // a preceding shuffle. + DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Record the old value. 
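// Illustrative sketch (not part of this patch): how a 4-lane mask is folded
// through the mask of the shuffle feeding it, and how the result packs into
// the 2-bits-per-lane immediate that PSHUFD/PSHUFLW/PSHUFHW take (the same
// layout the nearby getV4X86ShuffleImm8ForMask calls produce as an SDValue).
static unsigned mergeV4MasksToImm8(int Mask[4], const int VMask[4]) {
  // Lane i of the outer shuffle reads lane Mask[i] of the inner shuffle's
  // result, i.e. lane VMask[Mask[i]] of the inner shuffle's input.
  for (int i = 0; i != 4; ++i)
    Mask[i] = VMask[Mask[i]];
  // Pack the four 2-bit selectors, lane 0 in the lowest bits.
  return unsigned(Mask[0] & 3) | unsigned(Mask[1] & 3) << 2 |
         unsigned(Mask[2] & 3) << 4 | unsigned(Mask[3] & 3) << 6;
}
// e.g. Mask {1,0,3,2} folded through VMask {2,3,0,1} becomes {3,2,1,0},
// i.e. imm8 0x1B.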
+ SDValue Old = V; + + // Merge this node's mask and our incoming mask (adjusted to account for all + // the pshufd instructions encountered). + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + + // Check that the shuffles didn't cancel each other out. If not, we need to + // combine to the new one. + if (Old != V) + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + + return true; +} + +/// \brief Try to combine x86 target specific shuffles. +static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + SmallVector<int, 4> Mask; + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + Mask = getPSHUFShuffleMask(N); + assert(Mask.size() == 4); + break; + case X86ISD::UNPCKL: { + // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in + // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE + // moves upper half elements into the lower half part. For example: + // + // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, + // undef:v16i8 + // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 + // + // will be combined to: + // + // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 + + // This is only for 128-bit vectors. From SSE4.1 onward this combine may not + // happen due to advanced instructions. + if (!VT.is128BitVector()) + return SDValue(); + + auto Op0 = N.getOperand(0); + auto Op1 = N.getOperand(1); + if (Op0.getOpcode() == ISD::UNDEF && + Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask(); + + unsigned NumElts = VT.getVectorNumElements(); + SmallVector<int, 8> ExpectedMask(NumElts, -1); + std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, + NumElts / 2); + + auto ShufOp = Op1.getOperand(0); + if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); + } + return SDValue(); + } + case X86ISD::BLENDI: { + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && + "Unexpected input vector types"); + + // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector + // operands and changing the mask to 1. This saves us a bunch of + // pattern-matching possibilities related to scalar math ops in SSE/AVX. + // x86InstrInfo knows how to commute this back after instruction selection + // if it would help register allocation. + + // TODO: If optimizing for size or a processor that doesn't suffer from + // partial register update stalls, this should be transformed into a MOVSD + // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. + + if (VT == MVT::v2f64) + if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2))) + if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { + SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); + } + + return SDValue(); + } + default: + return SDValue(); + } + + // Nuke no-op shuffles that show up after combining. 
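// Aside (editorial, not from the patch) on the v2f64 BLENDI canonicalization
// handled above: the BLENDPD immediate has one bit per element, and a set bit
// i selects element i from the second source. So blendpd $2, %xmm1, %xmm0
// (element 1 from %xmm1, element 0 from %xmm0) computes the same vector as
// blendpd $1 with the two sources swapped, which is why a mask of 2 can be
// rewritten as a mask of 1 with commuted operands.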
+ if (isNoopShuffleMask(Mask)) + return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Look for simplifications involving one or two shuffle instructions. + SDValue V = N.getOperand(0); + switch (N.getOpcode()) { + default: + break; + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); + + if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) + return SDValue(); // We combined away this shuffle, so we're done. + + // See if this reduces to a PSHUFD which is no more expensive and can + // combine with more operations. Note that it has to at least flip the + // dwords as otherwise it would have been removed as a no-op. + if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { + int DMask[] = {0, 1, 2, 3}; + int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; + DMask[DOffset + 0] = DOffset + 1; + DMask[DOffset + 1] = DOffset + 0; + MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + V = DAG.getBitcast(DVT, V); + DCI.AddToWorklist(V.getNode()); + V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, + getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); + DCI.AddToWorklist(V.getNode()); + return DAG.getBitcast(VT, V); + } + + // Look for shuffle patterns which can be implemented as a single unpack. + // FIXME: This doesn't handle the location of the PSHUFD generically, and + // only works when we have a PSHUFD followed by two half-shuffles. + if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && + (V.getOpcode() == X86ISD::PSHUFLW || + V.getOpcode() == X86ISD::PSHUFHW) && + V.getOpcode() != N.getOpcode() && + V.hasOneUse()) { + SDValue D = V.getOperand(0); + while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) + D = D.getOperand(0); + if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); + int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int WordMask[8]; + for (int i = 0; i < 4; ++i) { + WordMask[i + NOffset] = Mask[i] + NOffset; + WordMask[i + VOffset] = VMask[i] + VOffset; + } + // Map the word mask through the DWord mask. + int MappedMask[8]; + for (int i = 0; i < 8; ++i) + MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; + if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || + makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { + // We can replace all three shuffles with an unpack. + V = DAG.getBitcast(VT, D.getOperand(0)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL + : X86ISD::UNPCKH, + DL, VT, V, V); + } + } + } + + break; + + case X86ISD::PSHUFD: + if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + return NewN; + + break; + } + + return SDValue(); +} + +/// \brief Try to combine a shuffle into a target-specific add-sub node. +/// +/// We combine this directly on the abstract vector shuffle nodes so it is +/// easier to generically match. We also insert dummy vector shuffle nodes for +/// the operands which explicitly discard the lanes which are unused by this +/// operation to try to flow through the rest of the combiner the fact that +/// they're unused. 
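// Worked example (editorial aside, not from the patch): for v4f32, ADDSUBPS
// computes { a0-b0, a1+b1, a2-b2, a3+b3 }. The combine below therefore looks
// for shuffle(FSUB(a,b), FADD(a,b)) with mask {0,5,2,7}: lanes 0 and 2 are
// taken from the subtract, lanes 1 and 3 (indices 5 and 7 address the second
// shuffle operand) from the add, which is exactly the ADDSUB lane pattern.
// The v2f64 mask {0,3} and the v8f32/v4f64 masks checked below are the same
// pattern at other widths.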
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + return SDValue(); + + // We only handle target-independent shuffles. + // FIXME: It would be easy and harmless to use the target shuffle mask + // extraction tool to support more. + if (N->getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + auto *SVN = cast<ShuffleVectorSDNode>(N); + SmallVector<int, 8> Mask; + for (int M : SVN->getMask()) + Mask.push_back(M); + + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + + // We require the first shuffle operand to be the FSUB node, and the second to + // be the FADD node. + if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(V1, V2); + } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) + return SDValue(); + + // If there are other uses of these operations we can't fold them. + if (!V1->hasOneUse() || !V2->hasOneUse()) + return SDValue(); + + // Ensure that both operations have the same operands. Note that we can + // commute the FADD operands. + SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); + if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && + (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) + return SDValue(); + + // We're looking for blends between FADD and FSUB nodes. We insist on these + // nodes being lined up in a specific expected pattern. + if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || + isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || + isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) + return SDValue(); + + return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); +} + +/// PerformShuffleCombine - Performs several different shuffle combines. +static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // Don't create instructions with illegal types after legalize types has run. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) + return SDValue(); + + // If we have legalized the vector types, look for blends of FADD and FSUB + // nodes that we can fuse into an ADDSUB node. + if (TLI.isTypeLegal(VT)) + if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG)) + return AddSub; + + // Combine 256-bit vector shuffles. This is only profitable when in AVX mode + if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && + N->getOpcode() == ISD::VECTOR_SHUFFLE) + return PerformShuffleCombine256(N, DAG, DCI, Subtarget); + + // During Type Legalization, when promoting illegal vector types, + // the backend might introduce new shuffle dag nodes and bitcasts. + // + // This code performs the following transformation: + // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> + // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) + // + // We do this only if both the bitcast and the BINOP dag nodes have + // one use. Also, perform this transformation only if the new binary + // operation is legal. 
This is to avoid introducing dag nodes that + // potentially need to be further expanded (or custom lowered) into a + // less optimal sequence of dag nodes. + if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && + N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && + N0.getOpcode() == ISD::BITCAST) { + SDValue BC0 = N0.getOperand(0); + EVT SVT = BC0.getValueType(); + unsigned Opcode = BC0.getOpcode(); + unsigned NumElts = VT.getVectorNumElements(); + + if (BC0.hasOneUse() && SVT.isVector() && + SVT.getVectorNumElements() * 2 == NumElts && + TLI.isOperationLegal(Opcode, VT)) { + bool CanFold = false; + switch (Opcode) { + default : break; + case ISD::ADD : + case ISD::FADD : + case ISD::SUB : + case ISD::FSUB : + case ISD::MUL : + case ISD::FMUL : + CanFold = true; + } + + unsigned SVTNumElts = SVT.getVectorNumElements(); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) == (int)(i * 2); + for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) < 0; + + if (CanFold) { + SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); + SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); + SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); + return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); + } + } + } + + // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, + // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are + // consecutive, non-overlapping, and in the right order. + SmallVector<SDValue, 16> Elts; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) + Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); + + if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) + return LD; + + if (isTargetShuffle(N->getOpcode())) { + SDValue Shuffle = + PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + + // Try recursively combining arbitrary sequences of x86 shuffle + // instructions into higher-order shuffles. We do this after combining + // specific PSHUF instruction sequences into their minimal form so that we + // can evaluate how many specialized shuffle instructions are involved in + // a particular chain. + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, + /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. + } + + return SDValue(); +} + +/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target +/// specific shuffle of a load can be folded into a single element load. +/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but +/// shuffles have been custom lowered so we need to handle those here. +static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue InVec = N->getOperand(0); + SDValue EltNo = N->getOperand(1); + + if (!isa<ConstantSDNode>(EltNo)) + return SDValue(); + + EVT OriginalVT = InVec.getValueType(); + + if (InVec.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. 
+ if (!InVec.hasOneUse()) + return SDValue(); + EVT BCVT = InVec.getOperand(0).getValueType(); + if (!BCVT.isVector() || + BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + return SDValue(); + InVec = InVec.getOperand(0); + } + + EVT CurrentVT = InVec.getValueType(); + + if (!isTargetShuffle(InVec.getOpcode())) + return SDValue(); + + // Don't duplicate a load with other uses. + if (!InVec.hasOneUse()) + return SDValue(); + + SmallVector<int, 16> ShuffleMask; + bool UnaryShuffle; + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + ShuffleMask, UnaryShuffle)) + return SDValue(); + + // Select the input vector, guarding against out of range extract vector. + unsigned NumElems = CurrentVT.getVectorNumElements(); + int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; + SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) + : InVec.getOperand(1); + + // If inputs to shuffle are the same for both ops, then allow 2 uses + unsigned AllowedUses = InVec.getNumOperands() > 1 && + InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; + + if (LdNode.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. + if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) + return SDValue(); + + AllowedUses = 1; // only allow 1 load use if we have a bitcast + LdNode = LdNode.getOperand(0); + } + + if (!ISD::isNormalLoad(LdNode.getNode())) + return SDValue(); + + LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); + + if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) + return SDValue(); + + EVT EltVT = N->getValueType(0); + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. + unsigned Align = LN0->getAlignment(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( + EltVT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) + return SDValue(); + + // All checks match so transform back to vector_shuffle so that DAG combiner + // can finish the job + SDLoc dl(N); + + // Create shuffle node taking into account the case that its a unary shuffle + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) + : InVec.getOperand(1); + Shuffle = DAG.getVectorShuffle(CurrentVT, dl, + InVec.getOperand(0), Shuffle, + &ShuffleMask[0]); + Shuffle = DAG.getBitcast(OriginalVT, Shuffle); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, + EltNo); +} + +static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // Detect bitcasts between i32 to x86mmx low word. Since MMX types are + // special and don't usually play with other vector types, it's better to + // handle them early to be sure we emit efficient code by avoiding + // store-load conversions. + if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && + N0.getValueType() == MVT::v2i32 && + isNullConstant(N0.getOperand(1))) { + SDValue N00 = N0->getOperand(0); + if (N00.getValueType() == MVT::i32) + return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); + } + + // Convert a bitcasted integer logic operation that has one bitcasted + // floating-point operand and one constant operand into a floating-point + // logic operation. 
This may create a load of the constant, but that is + // cheaper than materializing the constant in an integer register and + // transferring it to an SSE register or transferring the SSE operand to + // integer register and back. + unsigned FPOpcode; + switch (N0.getOpcode()) { + case ISD::AND: FPOpcode = X86ISD::FAND; break; + case ISD::OR: FPOpcode = X86ISD::FOR; break; + case ISD::XOR: FPOpcode = X86ISD::FXOR; break; + default: return SDValue(); + } + if (((Subtarget->hasSSE1() && VT == MVT::f32) || + (Subtarget->hasSSE2() && VT == MVT::f64)) && + isa<ConstantSDNode>(N0.getOperand(1)) && + N0.getOperand(0).getOpcode() == ISD::BITCAST && + N0.getOperand(0).getOperand(0).getValueType() == VT) { + SDValue N000 = N0.getOperand(0).getOperand(0); + SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1)); + return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst); + } + + return SDValue(); +} + +/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index +/// generation and convert it from being a bunch of shuffles and extracts +/// into a somewhat faster sequence. For i686, the best sequence is apparently +/// storing the value and loading scalars back, while for x64 we should +/// use 64-bit extracts and shifts. +static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) + return NewOp; + + SDValue InputVector = N->getOperand(0); + SDLoc dl(InputVector); + // Detect mmx to i32 conversion through a v2i32 elt extract. + if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && + N->getValueType(0) == MVT::i32 && + InputVector.getValueType() == MVT::v2i32) { + + // The bitcast source is a direct mmx result. + SDValue MMXSrc = InputVector.getNode()->getOperand(0); + if (MMXSrc.getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), + InputVector.getNode()->getOperand(0)); + + // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))). + if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() && + MMXSrc.getValueType() == MVT::i64) { + SDValue MMXSrcOp = MMXSrc.getOperand(0); + if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST && + MMXSrcOp.getValueType() == MVT::v1i64 && + MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), MMXSrcOp.getOperand(0)); + } + } + + EVT VT = N->getValueType(0); + + if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) && + InputVector.getOpcode() == ISD::BITCAST && + isa<ConstantSDNode>(InputVector.getOperand(0))) { + uint64_t ExtractedElt = + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + uint64_t InputValue = + cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); + uint64_t Res = (InputValue >> ExtractedElt) & 1; + return DAG.getConstant(Res, dl, MVT::i1); + } + // Only operate on vectors of 4 elements, where the alternative shuffling + // gets to be more expensive. + if (InputVector.getValueType() != MVT::v4i32) + return SDValue(); + + // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a + // single use which is a sign-extend or zero-extend, and all elements are + // used. 
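// Illustrative sketch (not part of this patch): the lane recovery the 64-bit
// path below performs once these checks succeed. With little-endian lane
// order, bitcasting the v4i32 to v2i64 puts lanes 0/1 into the low/high halves
// of the first i64 and lanes 2/3 into the second, so four element extracts
// become two i64 extracts plus shifts and truncations.
#include <cstdint>
static void recoverLanes(uint64_t BottomHalf, uint64_t TopHalf,
                         uint32_t Lane[4]) {
  Lane[0] = uint32_t(BottomHalf);       // truncate
  Lane[1] = uint32_t(BottomHalf >> 32); // shift, then truncate
  Lane[2] = uint32_t(TopHalf);
  Lane[3] = uint32_t(TopHalf >> 32);
}
// The DAG code uses an arithmetic shift (SRA); after truncation to 32 bits the
// result is the same as the logical shift used here.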
+ SmallVector<SDNode *, 4> Uses; + unsigned ExtractedElements = 0; + for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), + UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { + if (UI.getUse().getResNo() != InputVector.getResNo()) + return SDValue(); + + SDNode *Extract = *UI; + if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + if (Extract->getValueType(0) != MVT::i32) + return SDValue(); + if (!Extract->hasOneUse()) + return SDValue(); + if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && + Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + if (!isa<ConstantSDNode>(Extract->getOperand(1))) + return SDValue(); + + // Record which element was extracted. + ExtractedElements |= + 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); + + Uses.push_back(Extract); + } + + // If not all the elements were used, this may not be worthwhile. + if (ExtractedElements != 15) + return SDValue(); + + // Ok, we've now decided to do the transformation. + // If 64-bit shifts are legal, use the extract-shift sequence, + // otherwise bounce the vector off the cache. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Vals[4]; + + if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { + SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); + auto &DL = DAG.getDataLayout(); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); + SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(0, dl, VecIdxTy)); + SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(1, dl, VecIdxTy)); + + SDValue ShAmt = DAG.getConstant( + 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); + Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); + Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); + Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); + Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); + } else { + // Store the value to a temporary stack slot. + SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); + SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, + MachinePointerInfo(), false, false, 0); + + EVT ElementType = InputVector.getValueType().getVectorElementType(); + unsigned EltSize = ElementType.getSizeInBits() / 8; + + // Replace each use (extract) with a load of the appropriate element. + for (unsigned i = 0; i < 4; ++i) { + uint64_t Offset = EltSize * i; + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); + + SDValue ScalarAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); + + // Load the scalar. + Vals[i] = DAG.getLoad(ElementType, dl, Ch, + ScalarAddr, MachinePointerInfo(), + false, false, false, 0); + + } + } + + // Replace the extracts + for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), + UE = Uses.end(); UI != UE; ++UI) { + SDNode *Extract = *UI; + + SDValue Idx = Extract->getOperand(1); + uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); + } + + // The replacement was made in place; don't return anything. 
+ return SDValue(); +} + +static SDValue +transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + + if (Cond.getOpcode() == ISD::SIGN_EXTEND) { + SDValue CondSrc = Cond->getOperand(0); + if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) + Cond = CondSrc->getOperand(0); + } + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). + if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && + ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) + return SDValue(); + + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) + return SDValue(); + + MVT VT = N->getSimpleValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> ShuffleMask(NumElems, -1); + for (unsigned i = 0; i < NumElems; ++i) { + // Be sure we emit undef where we can. + if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) + ShuffleMask[i] = -1; + else + ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); + } + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isShuffleMaskLegal(ShuffleMask, VT)) + return SDValue(); + return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); +} + +/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT +/// nodes. +static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + SDValue Cond = N->getOperand(0); + // Get the LHS/RHS of the select. + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + EVT VT = LHS.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // If we have SSE[12] support, try to form min/max nodes. SSE min/max + // instructions match the semantics of the common C idiom x<y?x:y but not + // x<=y?x:y, because of how they handle negative zero (which can be + // ignored in unsafe-math mode). + // We also try to create v2f32 min/max nodes, which we later widen to v4f32. + if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && + VT != MVT::f80 && VT != MVT::f128 && + (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && + (Subtarget->hasSSE2() || + (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + unsigned Opcode = 0; + // Check for x CC y ? x : y. + if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + switch (CC) { + default: break; + case ISD::SETULT: + // Converting this to a min would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { + if (!DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMIN; + break; + case ISD::SETOLE: + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly. 
+ if (!DAG.getTarget().Options.UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; + Opcode = X86ISD::FMIN; + break; + case ISD::SETULE: + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOLT: + case ISD::SETLT: + case ISD::SETLE: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETOGE: + // Converting this to a max would handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; + Opcode = X86ISD::FMAX; + break; + case ISD::SETUGT: + // Converting this to a max would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { + if (!DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMAX; + break; + case ISD::SETUGE: + // Converting this to a max would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOGT: + case ISD::SETGT: + case ISD::SETGE: + Opcode = X86ISD::FMAX; + break; + } + // Check for x CC y ? y : x -- a min/max with reversed arms. + } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && + DAG.isEqualTo(RHS, Cond.getOperand(0))) { + switch (CC) { + default: break; + case ISD::SETOGE: + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly, and swapping the operands would + // cause it to handle NaNs incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMIN; + break; + case ISD::SETUGT: + // Converting this to a min would handle NaNs incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) + break; + Opcode = X86ISD::FMIN; + break; + case ISD::SETUGE: + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOGT: + case ISD::SETGT: + case ISD::SETGE: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETULT: + // Converting this to a max would handle NaNs incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + Opcode = X86ISD::FMAX; + break; + case ISD::SETOLE: + // Converting this to a max would handle comparisons between positive + // and negative zero incorrectly, and swapping the operands would + // cause it to handle NaNs incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMAX; + break; + case ISD::SETULE: + // Converting this to a max would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. 
+ std::swap(LHS, RHS); + case ISD::SETOLT: + case ISD::SETLT: + case ISD::SETLE: + Opcode = X86ISD::FMAX; + break; + } + } + + if (Opcode) + return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); + } + + EVT CondVT = Cond.getValueType(); + if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && + CondVT.getVectorElementType() == MVT::i1) { + // v16i8 (select v16i1, v16i8, v16i8) does not have a proper + // lowering on KNL. In this case we convert it to + // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. + // The same situation for all 128 and 256-bit vectors of i8 and i16. + // Since SKX these selects have a proper lowering. + EVT OpVT = LHS.getValueType(); + if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && + (OpVT.getVectorElementType() == MVT::i8 || + OpVT.getVectorElementType() == MVT::i16) && + !(Subtarget->hasBWI() && Subtarget->hasVLX())) { + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); + DCI.AddToWorklist(Cond.getNode()); + return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); + } + } + // If this is a select between two integer constants, try to do some + // optimizations. + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) + // Don't do this for crazy integer types. + if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { + // If this is efficiently invertible, canonicalize the LHSC/RHSC values + // so that TrueC (the true value) is larger than FalseC. + bool NeedsCondInvert = false; + + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && + // Efficiently invertible. + (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. + (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. + isa<ConstantSDNode>(Cond.getOperand(1))))) { + NeedsCondInvert = true; + std::swap(TrueC, FalseC); + } + + // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. + if (FalseC->getAPIntValue() == 0 && + TrueC->getAPIntValue().isPowerOf2()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, DL, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); + + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); + return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, + DAG.getConstant(ShAmt, DL, MVT::i8)); + } + + // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. + if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, DL, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); + return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
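// Illustrative sketch (not part of this patch): the scalar identities behind
// the select-of-constants folds in this block, with Cond already normalized to
// 0 or 1 and TrueC > FalseC.
#include <cstdint>
static uint64_t selectPow2(uint64_t Cond, unsigned Log2TrueC) {
  return Cond << Log2TrueC;    // Cond ? 2^k : 0, i.e. zext(Cond) << k
}
static uint64_t selectByDiff(uint64_t Cond, uint64_t FalseC, uint64_t Diff) {
  return Cond * Diff + FalseC; // Cond ? FalseC + Diff : FalseC
}
// When Diff is 1, 2, 3, 4, 5, 8 or 9 the multiply-add maps onto an add or a
// single LEA; e.g. Cond ? 13 : 5 has Diff == 8 and becomes
// leaq 5(,%rcx,8), %rax once Cond has been zero-extended into %rcx.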
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, DL, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, DL, + Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + return Cond; + } + } + } + } + + // Canonicalize max and min: + // (x > y) ? x : y -> (x >= y) ? x : y + // (x < y) ? x : y -> (x <= y) ? x : y + // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates + // the need for an extra compare + // against zero. e.g. + // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 + // subl %esi, %edi + // testl %edi, %edi + // movl $0, %eax + // cmovgl %edi, %eax + // => + // xorl %eax, %eax + // subl %esi, $edi + // cmovsl %eax, %edi + if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && + DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + switch (CC) { + default: break; + case ISD::SETLT: + case ISD::SETGT: { + ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; + Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), + Cond.getOperand(0), Cond.getOperand(1), NewCC); + return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); + } + } + } + + // Early exit check + if (!TLI.isTypeLegal(VT)) + return SDValue(); + + // Match VSELECTs into subs with unsigned saturation. + if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. + ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || + (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + // Check if one of the arms of the VSELECT is a zero vector. If it's on the + // left side invert the predicate to simplify logic below. + SDValue Other; + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + Other = RHS; + CC = ISD::getSetCCInverse(CC, true); + } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { + Other = LHS; + } + + if (Other.getNode() && Other->getNumOperands() == 2 && + DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { + SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); + SDValue CondRHS = Cond->getOperand(1); + + // Look for a general sub with unsigned saturation first. + // x >= y ? 
x-y : 0 --> subus x, y + // x > y ? x-y : 0 --> subus x, y + if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && + Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); + + if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) + if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { + if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS)) + if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) + // If the RHS is a constant we have to reverse the const + // canonicalization. + // x > C-1 ? x+-C : 0 --> subus x, C + if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && + CondRHSConst->getAPIntValue() == + (-OpRHSConst->getAPIntValue() - 1)) + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT)); + + // Another special case: If C was a sign bit, the sub has been + // canonicalized into a xor. + // FIXME: Would it be better to use computeKnownBits to determine + // whether it's safe to decanonicalize the xor? + // x s< 0 ? x^C : 0 --> subus x, C + if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && + ISD::isBuildVectorAllZeros(CondRHS.getNode()) && + OpRHSConst->getAPIntValue().isSignBit()) + // Note that we have to rebuild the RHS constant here to ensure we + // don't rely on particular values of undef lanes. + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT)); + } + } + } + + // Simplify vector selection if condition value type matches vselect + // operand type + if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { + assert(Cond.getValueType().isVector() && + "vector select expects a vector selector!"); + + bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + // Try invert the condition if true value is not all 1s and false value + // is not all 0s. + if (!TValIsAllOnes && !FValIsAllZeros && + // Check if the selector will be produced by CMPP*/PCMP* + Cond.getOpcode() == ISD::SETCC && + // Check if SETCC has already been promoted + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == + CondVT) { + bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); + + if (TValIsAllZeros || FValIsAllOnes) { + SDValue CC = Cond.getOperand(2); + ISD::CondCode NewCC = + ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), + Cond.getOperand(0).getValueType().isInteger()); + Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); + std::swap(LHS, RHS); + TValIsAllOnes = FValIsAllOnes; + FValIsAllZeros = TValIsAllZeros; + } + } + + if (TValIsAllOnes || FValIsAllZeros) { + SDValue Ret; + + if (TValIsAllOnes && FValIsAllZeros) + Ret = Cond; + else if (TValIsAllOnes) + Ret = + DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS)); + else if (FValIsAllZeros) + Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, + DAG.getBitcast(CondVT, LHS)); + + return DAG.getBitcast(VT, Ret); + } + } + + // We should generate an X86ISD::BLENDI from a vselect if its argument + // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of + // constants. 
This specific pattern gets generated when we split a + // selector for a 512 bit vector in a machine without AVX512 (but with + // 256-bit vectors), during legalization: + // + // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) + // + // Iff we find this pattern and the build_vectors are built from + // constants, we translate the vselect into a shuffle_vector that we + // know will be matched by LowerVECTOR_SHUFFLEtoBlend. + if ((N->getOpcode() == ISD::VSELECT || + N->getOpcode() == X86ISD::SHRUNKBLEND) && + !DCI.isBeforeLegalize() && !VT.is512BitVector()) { + SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + + // If this is a *dynamic* select (non-constant condition) and we can match + // this node with one of the variable blend instructions, restructure the + // condition so that the blends can use the high bit of each element and use + // SimplifyDemandedBits to simplify the condition operand. + if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && + !DCI.isBeforeLegalize() && + !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { + unsigned BitWidth = Cond.getValueType().getScalarSizeInBits(); + + // Don't optimize vector selects that map to mask-registers. + if (BitWidth == 1) + return SDValue(); + + // We can only handle the cases where VSELECT is directly legal on the + // subtarget. We custom lower VSELECT nodes with constant conditions and + // this makes it hard to see whether a dynamic VSELECT will correctly + // lower, so we both check the operation's status and explicitly handle the + // cases where a *dynamic* blend will fail even though a constant-condition + // blend could be custom lowered. + // FIXME: We should find a better way to handle this class of problems. + // Potentially, we should combine constant-condition vselect nodes + // pre-legalization into shuffles and not mark as many types as custom + // lowered. + if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + // FIXME: We don't support i16-element blends currently. We could and + // should support them by making *all* the bits in the condition be set + // rather than just the high bit and using an i8-element blend. + if (VT.getVectorElementType() == MVT::i16) + return SDValue(); + // Dynamic blending was only available from SSE4.1 onward. + if (VT.is128BitVector() && !Subtarget->hasSSE41()) + return SDValue(); + // Byte blends are only available in AVX2 + if (VT == MVT::v32i8 && !Subtarget->hasAVX2()) + return SDValue(); + + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || + TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, + TLO)) { + // If we changed the computation somewhere in the DAG, this change + // will affect all users of Cond. + // Make sure it is fine and update all the nodes so that we do not + // use the generic VSELECT anymore. Otherwise, we may perform + // wrong optimizations as we messed up with the actual expectation + // for the vector boolean values. + if (Cond != TLO.Old) { + // Check all uses of that condition operand to check whether it will be + // consumed by non-BLEND instructions, which may depend on all bits are + // set properly. 
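// Aside (editorial, not from the patch): the variable blends targeted here
// (BLENDVPS/BLENDVPD/PBLENDVB) select each lane using only the most
// significant bit of the corresponding condition element, which is why the
// code above asks SimplifyDemandedBits for just the high bit of each element.
// Any other user of Cond may still depend on the full all-ones/all-zeros
// boolean pattern, so the scan below gives up if a non-VSELECT user is found.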
+ for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + if (I->getOpcode() != ISD::VSELECT) + // TODO: Add other opcodes eventually lowered into BLEND. + return SDValue(); + + // Update all the users of the condition, before committing the change, + // so that the VSELECT optimizations that expect the correct vector + // boolean value will not be triggered. + for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + DAG.ReplaceAllUsesOfValueWith( + SDValue(*I, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), + Cond, I->getOperand(1), I->getOperand(2))); + DCI.CommitTargetLoweringOpt(TLO); + return SDValue(); + } + // At this point, only Cond is changed. Change the condition + // just for N to keep the opportunity to optimize all other + // users their own way. + DAG.ReplaceAllUsesOfValueWith( + SDValue(N, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), + TLO.New, N->getOperand(1), N->getOperand(2))); + return SDValue(); + } + } + + return SDValue(); +} + +// Check whether a boolean test is testing a boolean value generated by +// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition +// code. +// +// Simplify the following patterns: +// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or +// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) +// to (Op EFLAGS Cond) +// +// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or +// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) +// to (Op EFLAGS !Cond) +// +// where Op could be BRCOND or CMOV. +// +static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { + // Quit if not CMP and SUB with its value result used. + if (Cmp.getOpcode() != X86ISD::CMP && + (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) + return SDValue(); + + // Quit if not used as a boolean value. + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + // Check CMP operands. One of them should be 0 or 1 and the other should be + // an SetCC or extended from it. + SDValue Op1 = Cmp.getOperand(0); + SDValue Op2 = Cmp.getOperand(1); + + SDValue SetCC; + const ConstantSDNode* C = nullptr; + bool needOppositeCond = (CC == X86::COND_E); + bool checkAgainstTrue = false; // Is it a comparison against 1? + + if ((C = dyn_cast<ConstantSDNode>(Op1))) + SetCC = Op2; + else if ((C = dyn_cast<ConstantSDNode>(Op2))) + SetCC = Op1; + else // Quit if all operands are not constants. + return SDValue(); + + if (C->getZExtValue() == 1) { + needOppositeCond = !needOppositeCond; + checkAgainstTrue = true; + } else if (C->getZExtValue() != 0) + // Quit if the constant is neither 0 or 1. + return SDValue(); + + bool truncatedToBoolWithAnd = false; + // Skip (zext $x), (trunc $x), or (and $x, 1) node. + while (SetCC.getOpcode() == ISD::ZERO_EXTEND || + SetCC.getOpcode() == ISD::TRUNCATE || + SetCC.getOpcode() == ISD::AND) { + if (SetCC.getOpcode() == ISD::AND) { + int OpIdx = -1; + if (isOneConstant(SetCC.getOperand(0))) + OpIdx = 1; + if (isOneConstant(SetCC.getOperand(1))) + OpIdx = 0; + if (OpIdx == -1) + break; + SetCC = SetCC.getOperand(OpIdx); + truncatedToBoolWithAnd = true; + } else + SetCC = SetCC.getOperand(0); + } + + switch (SetCC.getOpcode()) { + case X86ISD::SETCC_CARRY: + // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to + // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, + // i.e. it's a comparison against true but the result of SETCC_CARRY is not + // truncated to i1 using 'and'. 
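+    // (For example, SETCC_CARRY may produce -1; comparing -1 against the
+    // constant 1 with EQ is false even though the carry was set, so the fold
+    // is only safe once the value has been masked down to 0 or 1 with 'and'.)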
+ if (checkAgainstTrue && !truncatedToBoolWithAnd) + break; + assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && + "Invalid use of SETCC_CARRY!"); + // FALL THROUGH + case X86ISD::SETCC: + // Set the condition code or opposite one if necessary. + CC = X86::CondCode(SetCC.getConstantOperandVal(0)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(1); + case X86ISD::CMOV: { + // Check whether false/true value has canonical one, i.e. 0 or 1. + ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); + ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); + // Quit if true value is not a constant. + if (!TVal) + return SDValue(); + // Quit if false value is not a constant. + if (!FVal) { + SDValue Op = SetCC.getOperand(0); + // Skip 'zext' or 'trunc' node. + if (Op.getOpcode() == ISD::ZERO_EXTEND || + Op.getOpcode() == ISD::TRUNCATE) + Op = Op.getOperand(0); + // A special case for rdrand/rdseed, where 0 is set if false cond is + // found. + if ((Op.getOpcode() != X86ISD::RDRAND && + Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) + return SDValue(); + } + // Quit if false value is not the constant 0 or 1. + bool FValIsFalse = true; + if (FVal && FVal->getZExtValue() != 0) { + if (FVal->getZExtValue() != 1) + return SDValue(); + // If FVal is 1, opposite cond is needed. + needOppositeCond = !needOppositeCond; + FValIsFalse = false; + } + // Quit if TVal is not the constant opposite of FVal. + if (FValIsFalse && TVal->getZExtValue() != 1) + return SDValue(); + if (!FValIsFalse && TVal->getZExtValue() != 0) + return SDValue(); + CC = X86::CondCode(SetCC.getConstantOperandVal(2)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(3); + } + } + + return SDValue(); +} + +/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. +/// Match: +/// (X86or (X86setcc) (X86setcc)) +/// (X86cmp (and (X86setcc) (X86setcc)), 0) +static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, + X86::CondCode &CC1, SDValue &Flags, + bool &isAnd) { + if (Cond->getOpcode() == X86ISD::CMP) { + if (!isNullConstant(Cond->getOperand(1))) + return false; + + Cond = Cond->getOperand(0); + } + + isAnd = false; + + SDValue SetCC0, SetCC1; + switch (Cond->getOpcode()) { + default: return false; + case ISD::AND: + case X86ISD::AND: + isAnd = true; + // fallthru + case ISD::OR: + case X86ISD::OR: + SetCC0 = Cond->getOperand(0); + SetCC1 = Cond->getOperand(1); + break; + }; + + // Make sure we have SETCC nodes, using the same flags value. + if (SetCC0.getOpcode() != X86ISD::SETCC || + SetCC1.getOpcode() != X86ISD::SETCC || + SetCC0->getOperand(1) != SetCC1->getOperand(1)) + return false; + + CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); + CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); + Flags = SetCC0->getOperand(1); + return true; +} + +/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] +static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + + // If the flag operand isn't dead, don't touch this CMOV. 
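+  // (Editorial note: when the node has two results, result 1 is its
+  // glue/EFLAGS value; rewriting the CMOV here would invalidate any reader of
+  // that second result.)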
+ if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) + return SDValue(); + + SDValue FalseOp = N->getOperand(0); + SDValue TrueOp = N->getOperand(1); + X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); + SDValue Cond = N->getOperand(3); + + if (CC == X86::COND_E || CC == X86::COND_NE) { + switch (Cond.getOpcode()) { + default: break; + case X86ISD::BSR: + case X86ISD::BSF: + // If operand of BSR / BSF are proven never zero, then ZF cannot be set. + if (DAG.isKnownNeverZero(Cond.getOperand(0))) + return (CC == X86::COND_E) ? FalseOp : TrueOp; + } + } + + SDValue Flags; + + Flags = checkBoolTestSetCCCombine(Cond, CC); + if (Flags.getNode() && + // Extra check as FCMOV only supports a subset of X86 cond. + (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { + SDValue Ops[] = { FalseOp, TrueOp, + DAG.getConstant(CC, DL, MVT::i8), Flags }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + } + + // If this is a select between two integer constants, try to do some + // optimizations. Note that the operands are ordered the opposite of SELECT + // operands. + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { + // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is + // larger than FalseC (the false value). + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { + CC = X86::GetOppositeBranchCondition(CC); + std::swap(TrueC, FalseC); + std::swap(TrueOp, FalseOp); + } + + // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. + // This is efficient for any integer data type (including i8/i16) and + // shift amount. + if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cond); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); + + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); + Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, + DAG.getConstant(ShAmt, DL, MVT::i8)); + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + + // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient + // for any integer data type, including i8/i16. + if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cond); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
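+      // (Illustration: a CMOV between the constants 14 (true) and 5 (false)
+      // has a difference of 9, so it is rebuilt below as
+      // zext(setcc(CC)) * 9 + 5, which fits a single lea 5(cond, cond*8).)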
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cond); + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, DL, Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + } + } + } + + // Handle these cases: + // (select (x != c), e, c) -> select (x != c), e, x), + // (select (x == c), c, e) -> select (x == c), x, e) + // where the c is an integer constant, and the "select" is the combination + // of CMOV and CMP. + // + // The rationale for this change is that the conditional-move from a constant + // needs two instructions, however, conditional-move from a register needs + // only one instruction. + // + // CAVEAT: By replacing a constant with a symbolic value, it may obscure + // some instruction-combining opportunities. This opt needs to be + // postponed as late as possible. + // + if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { + // the DCI.xxxx conditions are provided to postpone the optimization as + // late as possible. + + ConstantSDNode *CmpAgainst = nullptr; + if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && + (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && + !isa<ConstantSDNode>(Cond.getOperand(0))) { + + if (CC == X86::COND_NE && + CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { + CC = X86::GetOppositeBranchCondition(CC); + std::swap(TrueOp, FalseOp); + } + + if (CC == X86::COND_E && + CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { + SDValue Ops[] = { FalseOp, Cond.getOperand(0), + DAG.getConstant(CC, DL, MVT::i8), Cond }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); + } + } + } + + // Fold and/or of setcc's to double CMOV: + // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) + // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) + // + // This combine lets us generate: + // cmovcc1 (jcc1 if we don't have CMOV) + // cmovcc2 (same) + // instead of: + // setcc1 + // setcc2 + // and/or + // cmovne (jne if we don't have CMOV) + // When we can't use the CMOV instruction, it might increase branch + // mispredicts. + // When we can use CMOV, or when there is no mispredict, this improves + // throughput and reduces register pressure. 
+ // + if (CC == X86::COND_NE) { + SDValue Flags; + X86::CondCode CC0, CC1; + bool isAndSetCC; + if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { + if (isAndSetCC) { + std::swap(FalseOp, TrueOp); + CC0 = X86::GetOppositeBranchCondition(CC0); + CC1 = X86::GetOppositeBranchCondition(CC1); + } + + SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), + Flags}; + SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); + SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; + SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); + return CMOV; + } + } + + return SDValue(); +} + +/// PerformMulCombine - Optimize a single multiply with constant into two +/// in order to implement it with two cheaper instructions, e.g. +/// LEA + SHL, LEA + LEA. +static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // An imul is usually smaller than the alternative sequence. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i64 && VT != MVT::i32) + return SDValue(); + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C) + return SDValue(); + uint64_t MulAmt = C->getZExtValue(); + if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) + return SDValue(); + + uint64_t MulAmt1 = 0; + uint64_t MulAmt2 = 0; + if ((MulAmt % 9) == 0) { + MulAmt1 = 9; + MulAmt2 = MulAmt / 9; + } else if ((MulAmt % 5) == 0) { + MulAmt1 = 5; + MulAmt2 = MulAmt / 5; + } else if ((MulAmt % 3) == 0) { + MulAmt1 = 3; + MulAmt2 = MulAmt / 3; + } + + SDLoc DL(N); + SDValue NewMul; + if (MulAmt2 && + (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ + + if (isPowerOf2_64(MulAmt2) && + !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) + // If second multiplifer is pow2, issue it first. We want the multiply by + // 3, 5, or 9 to be folded into the addressing mode unless the lone use + // is an add. + std::swap(MulAmt1, MulAmt2); + + if (isPowerOf2_64(MulAmt1)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); + else + NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), + DAG.getConstant(MulAmt1, DL, VT)); + + if (isPowerOf2_64(MulAmt2)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, + DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); + else + NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, + DAG.getConstant(MulAmt2, DL, VT)); + } + + if (!NewMul) { + assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) + && "Both cases that could cause potential overflows should have " + "already been handled."); + if (isPowerOf2_64(MulAmt - 1)) + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt - 1), DL, + MVT::i8))); + + else if (isPowerOf2_64(MulAmt + 1)) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, + N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt + 1), + DL, MVT::i8)), N->getOperand(0)); + } + + if (NewMul) + // Do not add new nodes to DAG combiner worklist. 
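+    // (Illustration: with MulAmt == 45 the NewMul built above is (x * 9) * 5,
+    // i.e. two LEAs; with MulAmt == 17 it is (x << 4) + x.)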
+ DCI.CombineTo(N, NewMul, false); + + return SDValue(); +} + +static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + EVT VT = N0.getValueType(); + + // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) + // since the result of setcc_c is all zero's or all ones. + if (VT.isInteger() && !VT.isVector() && + N1C && N0.getOpcode() == ISD::AND && + N0.getOperand(1).getOpcode() == ISD::Constant) { + SDValue N00 = N0.getOperand(0); + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt ShAmt = N1C->getAPIntValue(); + Mask = Mask.shl(ShAmt); + bool MaskOK = false; + // We can handle cases concerning bit-widening nodes containing setcc_c if + // we carefully interrogate the mask to make sure we are semantics + // preserving. + // The transform is not safe if the result of C1 << C2 exceeds the bitwidth + // of the underlying setcc_c operation if the setcc_c was zero extended. + // Consider the following example: + // zext(setcc_c) -> i32 0x0000FFFF + // c1 -> i32 0x0000FFFF + // c2 -> i32 0x00000001 + // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE + // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if (N00.getOpcode() == ISD::SIGN_EXTEND && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || + N00.getOpcode() == ISD::ANY_EXTEND) && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); + } + if (MaskOK && Mask != 0) { + SDLoc DL(N); + return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); + } + } + + // Hardware support for vector shifts is sparse which makes us scalarize the + // vector operations in many cases. Also, on sandybridge ADD is faster than + // shl. + // (shl V, 1) -> add V,V + if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1)) + if (auto *N1SplatC = N1BV->getConstantSplatNode()) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1SplatC->getAPIntValue() == 1) + return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); + } + + return SDValue(); +} + +static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned Size = VT.getSizeInBits(); + + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) + // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or + // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) + // depending on sign of (SarConst - [56,48,32,24,16]) + + // sexts in X86 are MOVs. The MOVs have the same code size + // as above SHIFTs (only SHIFT on 1 has lower code size). + // However the MOVs have 2 advantages to a SHIFT: + // 1. MOVs can write to a register that differs from source + // 2. 
MOVs accept memory operands + + if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || + N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || + N0.getOperand(1).getOpcode() != ISD::Constant) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); + APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); + EVT CVT = N1.getValueType(); + + if (SarConst.isNegative()) + return SDValue(); + + for (MVT SVT : MVT::integer_valuetypes()) { + unsigned ShiftSize = SVT.getSizeInBits(); + // skipping types without corresponding sext/zext and + // ShlConst that is not one of [56,48,32,24,16] + if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) + continue; + SDLoc DL(N); + SDValue NN = + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); + SarConst = SarConst - (Size - ShiftSize); + if (SarConst == 0) + return NN; + else if (SarConst.isNegative()) + return DAG.getNode(ISD::SHL, DL, VT, NN, + DAG.getConstant(-SarConst, DL, CVT)); + else + return DAG.getNode(ISD::SRA, DL, VT, NN, + DAG.getConstant(SarConst, DL, CVT)); + } + return SDValue(); +} + +/// \brief Returns a vector of 0s if the node in input is a vector logical +/// shift by a constant amount which is known to be bigger than or equal +/// to the vector element size in bits. +static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + (!Subtarget->hasInt256() || + (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) + return SDValue(); + + SDValue Amt = N->getOperand(1); + SDLoc DL(N); + if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) + if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { + APInt ShiftAmt = AmtSplat->getAPIntValue(); + unsigned MaxAmount = + VT.getSimpleVT().getVectorElementType().getSizeInBits(); + + // SSE2/AVX2 logical shifts always return a vector of 0s + // if the shift amount is bigger than or equal to + // the element size. The constant shift amount will be + // encoded as a 8-bit immediate. + if (ShiftAmt.trunc(8).uge(MaxAmount)) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); + } + + return SDValue(); +} + +/// PerformShiftCombine - Combine shifts. +static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (N->getOpcode() == ISD::SHL) + if (SDValue V = PerformSHLCombine(N, DAG)) + return V; + + if (N->getOpcode() == ISD::SRA) + if (SDValue V = PerformSRACombine(N, DAG)) + return V; + + // Try to fold this logical shift into a zero vector. + if (N->getOpcode() != ISD::SRA) + if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) + return V; + + return SDValue(); +} + +// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) +// where both setccs reference the same FP CMP, and rewrite for CMPEQSS +// and friends. Likewise for OR -> CMPNEQSS. +static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + unsigned opcode; + + // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but + // we're requiring SSE2 for both. 
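+  // (Illustration: an ordered scalar float "a == b" is lowered to two SETCCs,
+  // COND_E and COND_NP, of a single UCOMISS; the code below folds that pair
+  // into one CMPEQSS-style compare whose all-ones/all-zeros result is then
+  // reduced to bit 0 (or into a mask-register compare on AVX-512).)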
+ if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CMP0 = N0->getOperand(1); + SDValue CMP1 = N1->getOperand(1); + SDLoc DL(N); + + // The SETCCs should both refer to the same CMP. + if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) + return SDValue(); + + SDValue CMP00 = CMP0->getOperand(0); + SDValue CMP01 = CMP0->getOperand(1); + EVT VT = CMP00.getValueType(); + + if (VT == MVT::f32 || VT == MVT::f64) { + bool ExpectingFlags = false; + // Check for any users that want flags: + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + !ExpectingFlags && UI != UE; ++UI) + switch (UI->getOpcode()) { + default: + case ISD::BR_CC: + case ISD::BRCOND: + case ISD::SELECT: + ExpectingFlags = true; + break; + case ISD::CopyToReg: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + break; + } + + if (!ExpectingFlags) { + enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); + enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); + + if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { + X86::CondCode tmp = cc0; + cc0 = cc1; + cc1 = tmp; + } + + if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || + (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { + // FIXME: need symbolic constants for these magic numbers. + // See X86ATTInstPrinter.cpp:printSSECC(). + unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; + if (Subtarget->hasAVX512()) { + SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, + CMP01, + DAG.getConstant(x86cc, DL, MVT::i8)); + if (N->getValueType(0) != MVT::i1) + return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), + FSetCC); + return FSetCC; + } + SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, + CMP00.getValueType(), CMP00, CMP01, + DAG.getConstant(x86cc, DL, + MVT::i8)); + + bool is64BitFP = (CMP00.getValueType() == MVT::f64); + MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; + + if (is64BitFP && !Subtarget->is64Bit()) { + // On a 32-bit target, we cannot bitcast the 64-bit float to a + // 64-bit integer, since that's not a legal type. Since + // OnesOrZeroesF is all ones of all zeroes, we don't need all the + // bits, but can do this little dance to extract the lowest 32 bits + // and work with those going forward. + SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, + OnesOrZeroesF); + SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); + OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, + Vector32, DAG.getIntPtrConstant(0, DL)); + IntVT = MVT::i32; + } + + SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); + SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, + DAG.getConstant(1, DL, IntVT)); + SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, + ANDed); + return OneBitOfTruth; + } + } + } + } + return SDValue(); +} + +/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector +/// so it can be folded inside ANDNP. +static bool CanFoldXORWithAllOnes(const SDNode *N) { + EVT VT = N->getValueType(0); + + // Match direct AllOnes for 128 and 256-bit vectors + if (ISD::isBuildVectorAllOnes(N)) + return true; + + // Look through a bit convert. 
+ if (N->getOpcode() == ISD::BITCAST) + N = N->getOperand(0).getNode(); + + // Sometimes the operand may come from a insert_subvector building a 256-bit + // allones vector + if (VT.is256BitVector() && + N->getOpcode() == ISD::INSERT_SUBVECTOR) { + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + + if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && + V1.getOperand(0).getOpcode() == ISD::UNDEF && + ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && + ISD::isBuildVectorAllOnes(V2.getNode())) + return true; + } + + return false; +} + +// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized +// register. In most cases we actually compare or select YMM-sized registers +// and mixing the two types creates horrible code. This method optimizes +// some of the transition sequences. +static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.is256BitVector()) + return SDValue(); + + assert((N->getOpcode() == ISD::ANY_EXTEND || + N->getOpcode() == ISD::ZERO_EXTEND || + N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); + + SDValue Narrow = N->getOperand(0); + EVT NarrowVT = Narrow->getValueType(0); + if (!NarrowVT.is128BitVector()) + return SDValue(); + + if (Narrow->getOpcode() != ISD::XOR && + Narrow->getOpcode() != ISD::AND && + Narrow->getOpcode() != ISD::OR) + return SDValue(); + + SDValue N0 = Narrow->getOperand(0); + SDValue N1 = Narrow->getOperand(1); + SDLoc DL(Narrow); + + // The Left side has to be a trunc. + if (N0.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + // The type of the truncated inputs. + EVT WideVT = N0->getOperand(0)->getValueType(0); + if (WideVT != VT) + return SDValue(); + + // The right side has to be a 'trunc' or a constant vector. + bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; + ConstantSDNode *RHSConstSplat = nullptr; + if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1)) + RHSConstSplat = RHSBV->getConstantSplatNode(); + if (!RHSTrunc && !RHSConstSplat) + return SDValue(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) + return SDValue(); + + // Set N0 and N1 to hold the inputs to the new wide operation. + N0 = N0->getOperand(0); + if (RHSConstSplat) { + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), + SDValue(RHSConstSplat, 0)); + SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); + N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); + } else if (RHSTrunc) { + N1 = N1->getOperand(0); + } + + // Generate the wide operation. 
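+  // (Illustration: for N == (zero_extend (and (trunc A), (trunc B))) with A
+  // and B of the wide 256-bit type, Op below is the AND computed directly in
+  // the wide type; the switch that follows re-applies the narrowing, an AND
+  // with a low-bits mask for zero_extend or a sign_extend_inreg for
+  // sign_extend.)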
+ SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + case ISD::ANY_EXTEND: + return Op; + case ISD::ZERO_EXTEND: { + unsigned InBits = NarrowVT.getScalarSizeInBits(); + APInt Mask = APInt::getAllOnesValue(InBits); + Mask = Mask.zext(VT.getScalarSizeInBits()); + return DAG.getNode(ISD::AND, DL, VT, + Op, DAG.getConstant(Mask, DL, VT)); + } + case ISD::SIGN_EXTEND: + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, + Op, DAG.getValueType(NarrowVT)); + default: + llvm_unreachable("Unexpected opcode"); + } +} + +static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // A vector zext_in_reg may be represented as a shuffle, + // feeding into a bitcast (this represents anyext) feeding into + // an and with a mask. + // We'd like to try to combine that into a shuffle with zero + // plus a bitcast, removing the and. + if (N0.getOpcode() != ISD::BITCAST || + N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + // The other side of the AND should be a splat of 2^C, where C + // is the number of bits in the source type. + if (N1.getOpcode() == ISD::BITCAST) + N1 = N1.getOperand(0); + if (N1.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1); + + ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0)); + EVT SrcType = Shuffle->getValueType(0); + + // We expect a single-source shuffle + if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF) + return SDValue(); + + unsigned SrcSize = SrcType.getScalarSizeInBits(); + + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!Vector->isConstantSplat(SplatValue, SplatUndef, + SplatBitSize, HasAnyUndefs)) + return SDValue(); + + unsigned ResSize = N1.getValueType().getScalarSizeInBits(); + // Make sure the splat matches the mask we expect + if (SplatBitSize > ResSize || + (SplatValue + 1).exactLogBase2() != (int)SrcSize) + return SDValue(); + + // Make sure the input and output size make sense + if (SrcSize >= ResSize || ResSize % SrcSize) + return SDValue(); + + // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...> + // The number of u's between each two values depends on the ratio between + // the source and dest type. + unsigned ZextRatio = ResSize / SrcSize; + bool IsZext = true; + for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) { + if (i % ZextRatio) { + if (Shuffle->getMaskElt(i) > 0) { + // Expected undef + IsZext = false; + break; + } + } else { + if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) { + // Expected element number + IsZext = false; + break; + } + } + } + + if (!IsZext) + return SDValue(); + + // Ok, perform the transformation - replace the shuffle with + // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero + // (instead of undef) where the k elements come from the zero vector. 
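+  // (Illustration: for a v16i8 source zero-extended 4x, ZextRatio == 4 and the
+  // mask built below is <0,16,16,16, 1,16,16,16, ...>; indices >= 16 select
+  // lanes of the zero vector that replaces the former undef operand.)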
+ SmallVector<int, 8> Mask; + unsigned NumElems = SrcType.getVectorNumElements(); + for (unsigned i = 0; i < NumElems; ++i) + if (i % ZextRatio) + Mask.push_back(NumElems); + else + Mask.push_back(i / ZextRatio); + + SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL, + Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask); + return DAG.getBitcast(N0.getValueType(), NewShuffle); +} + +/// If both input operands of a logic op are being cast from floating point +/// types, try to convert this into a floating point logic node to avoid +/// unnecessary moves from SSE to integer registers. +static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + unsigned FPOpcode = ISD::DELETED_NODE; + if (N->getOpcode() == ISD::AND) + FPOpcode = X86ISD::FAND; + else if (N->getOpcode() == ISD::OR) + FPOpcode = X86ISD::FOR; + else if (N->getOpcode() == ISD::XOR) + FPOpcode = X86ISD::FXOR; + + assert(FPOpcode != ISD::DELETED_NODE && + "Unexpected input node for FP logic conversion"); + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && + ((Subtarget->hasSSE1() && VT == MVT::i32) || + (Subtarget->hasSSE2() && VT == MVT::i64))) { + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + EVT N00Type = N00.getValueType(); + EVT N10Type = N10.getValueType(); + if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); + } + } + return SDValue(); +} + +static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget)) + return Zext; + + if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) + return R; + + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // Create BEXTR instructions + // BEXTR is ((X >> imm) & (2**size-1)) + if (VT == MVT::i32 || VT == MVT::i64) { + // Check for BEXTR. + if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && + (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { + ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (MaskNode && ShiftNode) { + uint64_t Mask = MaskNode->getZExtValue(); + uint64_t Shift = ShiftNode->getZExtValue(); + if (isMask_64(Mask)) { + uint64_t MaskSize = countPopulation(Mask); + if (Shift + MaskSize <= VT.getSizeInBits()) + return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), + DAG.getConstant(Shift | (MaskSize << 8), DL, + VT)); + } + } + } // BEXTR + + return SDValue(); + } + + // Want to form ANDNP nodes: + // 1) In the hopes of then easily combining them with OR and AND nodes + // to form PBLEND/PSIGN. 
+ // 2) To match ANDN packed intrinsics + if (VT != MVT::v2i64 && VT != MVT::v4i64) + return SDValue(); + + // Check LHS for vnot + if (N0.getOpcode() == ISD::XOR && + //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) + CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); + + // Check RHS for vnot + if (N1.getOpcode() == ISD::XOR && + //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) + CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); + + return SDValue(); +} + +static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) + return R; + + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // look for psign/blend + if (VT == MVT::v2i64 || VT == MVT::v4i64) { + if (!Subtarget->hasSSSE3() || + (VT == MVT::v4i64 && !Subtarget->hasInt256())) + return SDValue(); + + // Canonicalize pandn to RHS + if (N0.getOpcode() == X86ISD::ANDNP) + std::swap(N0, N1); + // or (and (m, y), (pandn m, x)) + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { + SDValue Mask = N1.getOperand(0); + SDValue X = N1.getOperand(1); + SDValue Y; + if (N0.getOperand(0) == Mask) + Y = N0.getOperand(1); + if (N0.getOperand(1) == Mask) + Y = N0.getOperand(0); + + // Check to see if the mask appeared in both the AND and ANDNP and + if (!Y.getNode()) + return SDValue(); + + // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. + // Look through mask bitcast. + if (Mask.getOpcode() == ISD::BITCAST) + Mask = Mask.getOperand(0); + if (X.getOpcode() == ISD::BITCAST) + X = X.getOperand(0); + if (Y.getOpcode() == ISD::BITCAST) + Y = Y.getOperand(0); + + EVT MaskVT = Mask.getValueType(); + + // Validate that the Mask operand is a vector sra node. + // FIXME: what to do for bytes, since there is a psignb/pblendvb, but + // there is no psrai.b + unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + unsigned SraAmt = ~0; + if (Mask.getOpcode() == ISD::SRA) { + if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) + if (auto *AmtConst = AmtBV->getConstantSplatNode()) + SraAmt = AmtConst->getZExtValue(); + } else if (Mask.getOpcode() == X86ISD::VSRAI) { + SDValue SraC = Mask.getOperand(1); + SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); + } + if ((SraAmt + 1) != EltBits) + return SDValue(); + + SDLoc DL(N); + + // Now we know we at least have a plendvb with the mask val. See if + // we can form a psignb/w/d. + // psign = x.type == y.type == mask.type && y = sub(0, x); + if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && + ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && + X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && + "Unsupported VT for PSIGN"); + Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); + return DAG.getBitcast(VT, Mask); + } + // PBLENDVB only available on SSE 4.1 + if (!Subtarget->hasSSE41()) + return SDValue(); + + MVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; + + X = DAG.getBitcast(BlendVT, X); + Y = DAG.getBitcast(BlendVT, Y); + Mask = DAG.getBitcast(BlendVT, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + return DAG.getBitcast(VT, Mask); + } + } + + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); + + // SHLD/SHRD instructions have lower register pressure, but on some + // platforms they have higher latency than the equivalent + // series of shifts/or that would otherwise be generated. + // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions + // have higher latencies and we are not optimizing for size. + if (!OptForSize && Subtarget->isSHLDSlow()) + return SDValue(); + + if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) + std::swap(N0, N1); + if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) + return SDValue(); + if (!N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + + SDValue ShAmt0 = N0.getOperand(1); + if (ShAmt0.getValueType() != MVT::i8) + return SDValue(); + SDValue ShAmt1 = N1.getOperand(1); + if (ShAmt1.getValueType() != MVT::i8) + return SDValue(); + if (ShAmt0.getOpcode() == ISD::TRUNCATE) + ShAmt0 = ShAmt0.getOperand(0); + if (ShAmt1.getOpcode() == ISD::TRUNCATE) + ShAmt1 = ShAmt1.getOperand(0); + + SDLoc DL(N); + unsigned Opc = X86ISD::SHLD; + SDValue Op0 = N0.getOperand(0); + SDValue Op1 = N1.getOperand(0); + if (ShAmt0.getOpcode() == ISD::SUB) { + Opc = X86ISD::SHRD; + std::swap(Op0, Op1); + std::swap(ShAmt0, ShAmt1); + } + + unsigned Bits = VT.getSizeInBits(); + if (ShAmt1.getOpcode() == ISD::SUB) { + SDValue Sum = ShAmt1.getOperand(0); + if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { + SDValue ShAmt1Op1 = ShAmt1.getOperand(1); + if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) + ShAmt1Op1 = ShAmt1Op1.getOperand(0); + if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) + return DAG.getNode(Opc, DL, VT, + Op0, Op1, + DAG.getNode(ISD::TRUNCATE, DL, + MVT::i8, ShAmt0)); + } + } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { + ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); + if (ShAmt0C && + ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) + return DAG.getNode(Opc, DL, VT, + N0.getOperand(0), N1.getOperand(0), + DAG.getNode(ISD::TRUNCATE, DL, + MVT::i8, ShAmt0)); + } + + return SDValue(); +} + +// Generate NEG and CMOV for integer abs. +static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // Since X86 does not have CMOV for 8-bit integer, we don't convert + // 8-bit integer abs to NEG and CMOV. + if (VT.isInteger() && VT.getSizeInBits() == 8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CMOV. + if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && + N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && + N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { + // Generate SUB & CMOV. 
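+        // (Illustration: for i32, abs(x) written as (x + s) ^ s with
+        // s = x >> 31 (arithmetic shift) becomes: Neg = 0 - x, which also
+        // defines the flags, then the CMOV selects Neg when that subtraction
+        // result is >= 0 and x otherwise.)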
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), + DAG.getConstant(0, DL, VT), N0.getOperand(0)); + + SDValue Ops[] = { N0.getOperand(0), Neg, + DAG.getConstant(X86::COND_GE, DL, MVT::i8), + SDValue(Neg.getNode(), 1) }; + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); + } + return SDValue(); +} + +// Try to turn tests against the signbit in the form of: +// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) +// into: +// SETGT(X, -1) +static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { + // This is only worth doing if the output type is i8. + if (N->getValueType(0) != MVT::i8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // We should be performing an xor against a truncated shift. + if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) + return SDValue(); + + // Make sure we are performing an xor against one. + if (!isOneConstant(N1)) + return SDValue(); + + // SetCC on x86 zero extends so only act on this if it's a logical shift. + SDValue Shift = N0.getOperand(0); + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) + return SDValue(); + + // Make sure we are truncating from one of i16, i32 or i64. + EVT ShiftTy = Shift.getValueType(); + if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) + return SDValue(); + + // Make sure the shift amount extracts the sign bit. + if (!isa<ConstantSDNode>(Shift.getOperand(1)) || + Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + return SDValue(); + + // Create a greater-than comparison against -1. + // N.B. Using SETGE against 0 works but we want a canonical looking + // comparison, using SETGT matches up with what TranslateX86CC. + SDLoc DL(N); + SDValue ShiftOp = Shift.getOperand(0); + EVT ShiftOpTy = ShiftOp.getValueType(); + SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp, + DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); + return Cond; +} + +static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) + return RV; + + if (Subtarget->hasCMov()) + if (SDValue RV = performIntegerAbsCombine(N, DAG)) + return RV; + + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + return SDValue(); +} + +/// This function detects the AVG pattern between vectors of unsigned i8/i16, +/// which is c = (a + b + 1) / 2, and replace this operation with the efficient +/// X86ISD::AVG instruction. +static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, + const X86Subtarget *Subtarget, SDLoc DL) { + if (!VT.isVector() || !VT.isSimple()) + return SDValue(); + EVT InVT = In.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + EVT ScalarVT = VT.getVectorElementType(); + if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && + isPowerOf2_32(NumElems))) + return SDValue(); + + // InScalarVT is the intermediate type in AVG pattern and it should be greater + // than the original input type (i8/i16). 
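+  // (Editorial note: the pattern being matched is the rounded unsigned
+  // average (a + b + 1) >> 1 evaluated in a type wide enough that the add
+  // cannot wrap, which is exactly what PAVGB/PAVGW compute.)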
+ EVT InScalarVT = InVT.getVectorElementType(); + if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) + return SDValue(); + + if (Subtarget->hasAVX512()) { + if (VT.getSizeInBits() > 512) + return SDValue(); + } else if (Subtarget->hasAVX2()) { + if (VT.getSizeInBits() > 256) + return SDValue(); + } else { + if (VT.getSizeInBits() > 128) + return SDValue(); + } + + // Detect the following pattern: + // + // %1 = zext <N x i8> %a to <N x i32> + // %2 = zext <N x i8> %b to <N x i32> + // %3 = add nuw nsw <N x i32> %1, <i32 1 x N> + // %4 = add nuw nsw <N x i32> %3, %2 + // %5 = lshr <N x i32> %N, <i32 1 x N> + // %6 = trunc <N x i32> %5 to <N x i8> + // + // In AVX512, the last instruction can also be a trunc store. + + if (In.getOpcode() != ISD::SRL) + return SDValue(); + + // A lambda checking the given SDValue is a constant vector and each element + // is in the range [Min, Max]. + auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV || !BV->isConstant()) + return false; + for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i)); + if (!C) + return false; + uint64_t Val = C->getZExtValue(); + if (Val < Min || Val > Max) + return false; + } + return true; + }; + + // Check if each element of the vector is left-shifted by one. + auto LHS = In.getOperand(0); + auto RHS = In.getOperand(1); + if (!IsConstVectorInRange(RHS, 1, 1)) + return SDValue(); + if (LHS.getOpcode() != ISD::ADD) + return SDValue(); + + // Detect a pattern of a + b + 1 where the order doesn't matter. + SDValue Operands[3]; + Operands[0] = LHS.getOperand(0); + Operands[1] = LHS.getOperand(1); + + // Take care of the case when one of the operands is a constant vector whose + // element is in the range [1, 256]. + if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && + Operands[0].getOpcode() == ISD::ZERO_EXTEND && + Operands[0].getOperand(0).getValueType() == VT) { + // The pattern is detected. Subtract one from the constant vector, then + // demote it and emit X86ISD::AVG instruction. + SDValue One = DAG.getConstant(1, DL, InScalarVT); + SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(NumElems, One)); + Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones); + Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1]); + } + + if (Operands[0].getOpcode() == ISD::ADD) + std::swap(Operands[0], Operands[1]); + else if (Operands[1].getOpcode() != ISD::ADD) + return SDValue(); + Operands[2] = Operands[1].getOperand(0); + Operands[1] = Operands[1].getOperand(1); + + // Now we have three operands of two additions. Check that one of them is a + // constant vector with ones, and the other two are promoted from i8/i16. + for (int i = 0; i < 3; ++i) { + if (!IsConstVectorInRange(Operands[i], 1, 1)) + continue; + std::swap(Operands[i], Operands[2]); + + // Check if Operands[0] and Operands[1] are results of type promotion. + for (int j = 0; j < 2; ++j) + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + + // The pattern is detected, emit X86ISD::AVG instruction. 
+ return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1].getOperand(0)); + } + + return SDValue(); +} + +/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. +static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + LoadSDNode *Ld = cast<LoadSDNode>(N); + EVT RegVT = Ld->getValueType(0); + EVT MemVT = Ld->getMemoryVT(); + SDLoc dl(Ld); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // For chips with slow 32-byte unaligned loads, break the 32-byte operation + // into two 16-byte operations. + ISD::LoadExtType Ext = Ld->getExtensionType(); + bool Fast; + unsigned AddressSpace = Ld->getAddressSpace(); + unsigned Alignment = Ld->getAlignment(); + if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && + Ext == ISD::NON_EXTLOAD && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, + AddressSpace, Alignment, &Fast) && !Fast) { + unsigned NumElems = RegVT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + NumElems/2); + SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Alignment); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + std::min(16U, Alignment)); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load1.getValue(1), + Load2.getValue(1)); + + SDValue NewVec = DAG.getUNDEF(RegVT); + NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); + NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); + return DCI.CombineTo(N, NewVec, TF, true); + } + + return SDValue(); +} + +/// PerformMLOADCombine - Resolve extending loads +static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); + if (Mld->getExtensionType() != ISD::SEXTLOAD) + return SDValue(); + + EVT VT = Mld->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + EVT LdVT = Mld->getMemoryVT(); + SDLoc dl(Mld); + + assert(LdVT != VT && "Cannot extend to the same type"); + unsigned ToSz = VT.getVectorElementType().getSizeInBits(); + unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for extending masked load"); + + unsigned SizeRatio = ToSz / FromSz; + assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + LdVT.getScalarType(), NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + // Convert Src0 value + SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0()); + if (Mld->getSrc0().getOpcode() != ISD::UNDEF) { + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. 
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + } + // Prepare the new mask + SDValue NewMask; + SDValue Mask = Mld->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getBitcast(WideVecVT, Mask); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, dl, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), + Mld->getBasePtr(), NewMask, WideSrc0, + Mld->getMemoryVT(), Mld->getMemOperand(), + ISD::NON_EXTLOAD); + SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); + return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); +} +/// PerformMSTORECombine - Resolve truncating stores +static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); + if (!Mst->isTruncatingStore()) + return SDValue(); + + EVT VT = Mst->getValue().getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + EVT StVT = Mst->getMemoryVT(); + SDLoc dl(Mst); + + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for truncating masked store"); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + assert (((NumElems * FromSz) % ToSz) == 0 && + "Unexpected ratio for truncating masked store"); + + unsigned SizeRatio = FromSz / ToSz; + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. 
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + + SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + + SDValue NewMask; + SDValue Mask = Mst->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getBitcast(WideVecVT, Mask); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, dl, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); +} +/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. +static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + StoreSDNode *St = cast<StoreSDNode>(N); + EVT VT = St->getValue().getValueType(); + EVT StVT = St->getMemoryVT(); + SDLoc dl(St); + SDValue StoredVal = St->getOperand(1); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // If we are saving a concatenation of two XMM registers and 32-byte stores + // are slow, such as on Sandy Bridge, perform two 16-byte stores. + bool Fast; + unsigned AddressSpace = St->getAddressSpace(); + unsigned Alignment = St->getAlignment(); + if (VT.is256BitVector() && StVT == VT && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AddressSpace, Alignment, &Fast) && !Fast) { + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); + SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); + + SDValue Stride = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); + SDValue Ptr0 = St->getBasePtr(); + SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); + + SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), Alignment); + SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(16U, Alignment)); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + } + + // Optimize trunc store (of multiple scalars) to shuffle and store. + // First, pack all of the elements in one place. Next, store to memory + // in fewer chunks. + if (St->isTruncatingStore() && VT.isVector()) { + // Check if we can detect an AVG pattern from the truncation. If yes, + // replace the trunc store by a normal store with the result of X86ISD::AVG + // instruction. 
+ SDValue Avg = + detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); + if (Avg.getNode()) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromSz) % ToSz) return SDValue(); + + unsigned SizeRatio = FromSz / ToSz; + + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) + StoreType = Tp; + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && + (64 <= NumElems * ToSz)) + StoreType = MVT::f64; + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue Ptr = St->getBasePtr(); + + // Perform one or more big stores into memory. + for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + StoreType, ShuffWide, + DAG.getIntPtrConstant(i, dl)); + SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + Chains.push_back(Ch); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + } + + // Turn load->store of MMX types into GPR load/stores. 
This avoids clobbering + // the FP state in cases where an emms may be missing. + // A preferable solution to the general problem is to figure out the right + // places to insert EMMS. This qualifies as a quick hack. + + // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. + if (VT.getSizeInBits() != 64) + return SDValue(); + + const Function *F = DAG.getMachineFunction().getFunction(); + bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); + bool F64IsLegal = + !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2(); + if ((VT.isVector() || + (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && + isa<LoadSDNode>(St->getValue()) && + !cast<LoadSDNode>(St->getValue())->isVolatile() && + St->getChain().hasOneUse() && !St->isVolatile()) { + SDNode* LdVal = St->getValue().getNode(); + LoadSDNode *Ld = nullptr; + int TokenFactorIndex = -1; + SmallVector<SDValue, 8> Ops; + SDNode* ChainVal = St->getChain().getNode(); + // Must be a store of a load. We currently handle two cases: the load + // is a direct child, and it's under an intervening TokenFactor. It is + // possible to dig deeper under nested TokenFactors. + if (ChainVal == LdVal) + Ld = cast<LoadSDNode>(St->getChain()); + else if (St->getValue().hasOneUse() && + ChainVal->getOpcode() == ISD::TokenFactor) { + for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { + if (ChainVal->getOperand(i).getNode() == LdVal) { + TokenFactorIndex = i; + Ld = cast<LoadSDNode>(St->getValue()); + } else + Ops.push_back(ChainVal->getOperand(i)); + } + } + + if (!Ld || !ISD::isNormalLoad(Ld)) + return SDValue(); + + // If this is not the MMX case, i.e. we are just turning i64 load/store + // into f64 load/store, avoid the transformation if there are multiple + // uses of the loaded value. + if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) + return SDValue(); + + SDLoc LdDL(Ld); + SDLoc StDL(N); + // If we are a 64-bit capable x86, lower to a single movq load/store pair. + // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store + // pair instead. + if (Subtarget->is64Bit() || F64IsLegal) { + MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; + SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + SDValue NewChain = NewLd.getValue(1); + if (TokenFactorIndex != -1) { + Ops.push_back(NewChain); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); + } + return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), + St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), + St->getAlignment()); + } + + // Otherwise, lower to two pairs of 32-bit loads / stores. 
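+    // For example (illustrative), an i64 copy through memory on a 32-bit
+    // target without usable f64 becomes roughly:
+    //   movl  (%src), %eax
+    //   movl 4(%src), %edx
+    //   movl  %eax,  (%dst)
+    //   movl  %edx, 4(%dst)
+    // i.e. two i32 loads at offsets 0 and 4 followed by two i32 stores.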
+ SDValue LoAddr = Ld->getBasePtr(); + SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, + DAG.getConstant(4, LdDL, MVT::i32)); + + SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, + Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, + Ld->getPointerInfo().getWithOffset(4), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), + MinAlign(Ld->getAlignment(), 4)); + + SDValue NewChain = LoLd.getValue(1); + if (TokenFactorIndex != -1) { + Ops.push_back(LoLd); + Ops.push_back(HiLd); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); + } + + LoAddr = St->getBasePtr(); + HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, + DAG.getConstant(4, StDL, MVT::i32)); + + SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, + St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), + St->getAlignment()); + SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, + St->getPointerInfo().getWithOffset(4), + St->isVolatile(), + St->isNonTemporal(), + MinAlign(St->getAlignment(), 4)); + return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); + } + + // This is similar to the above case, but here we handle a scalar 64-bit + // integer store that is extracted from a vector on a 32-bit target. + // If we have SSE2, then we can treat it like a floating-point double + // to get past legalization. The execution dependencies fixup pass will + // choose the optimal machine instruction for the store if this really is + // an integer or v2f32 rather than an f64. + if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() && + St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue OldExtract = St->getOperand(1); + SDValue ExtOp0 = OldExtract.getOperand(0); + unsigned VecSize = ExtOp0.getValueSizeInBits(); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); + SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); + SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + BitCast, OldExtract.getOperand(1)); + return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + } + + return SDValue(); +} + +/// Return 'true' if this vector operation is "horizontal" +/// and return the operands for the horizontal operation in LHS and RHS. A +/// horizontal operation performs the binary operation on successive elements +/// of its first operand, then on successive elements of its second operand, +/// returning the resulting values in a vector. For example, if +/// A = < float a0, float a1, float a2, float a3 > +/// and +/// B = < float b0, float b1, float b2, float b3 > +/// then the result of doing a horizontal operation on A and B is +/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. +/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form +/// A horizontal-op B, for some already available A and B, and if so then LHS is +/// set to A, RHS to B, and the routine returns 'true'. +/// Note that the binary operation should have the property that if one of the +/// operands is UNDEF then the result is UNDEF. 
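+/// When this returns 'true', the callers below can emit a single horizontal
+/// instruction (e.g. HADDPS/HADDPD, or PHADDW/PHADDD for the integer
+/// combines) in place of the shuffle-plus-vertical-op sequence. (Illustrative
+/// mapping; the exact opcode depends on the type and the calling combine.)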
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { + // Look for the following pattern: if + // A = < float a0, float a1, float a2, float a3 > + // B = < float b0, float b1, float b2, float b3 > + // and + // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> + // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> + // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > + // which is A horizontal-op B. + + // At least one of the operands should be a vector shuffle. + if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && + RHS.getOpcode() != ISD::VECTOR_SHUFFLE) + return false; + + MVT VT = LHS.getSimpleValueType(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for horizontal add/sub"); + + // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to + // operate independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts / NumLanes; + assert((NumLaneElts % 2 == 0) && + "Vector type should have an even number of elements in each lane"); + unsigned HalfLaneElts = NumLaneElts/2; + + // View LHS in the form + // LHS = VECTOR_SHUFFLE A, B, LMask + // If LHS is not a shuffle then pretend it is the shuffle + // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> + // NOTE: in what follows a default initialized SDValue represents an UNDEF of + // type VT. + SDValue A, B; + SmallVector<int, 16> LMask(NumElts); + if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) + A = LHS.getOperand(0); + if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) + B = LHS.getOperand(1); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), LMask.begin()); + } else { + if (LHS.getOpcode() != ISD::UNDEF) + A = LHS; + for (unsigned i = 0; i != NumElts; ++i) + LMask[i] = i; + } + + // Likewise, view RHS in the form + // RHS = VECTOR_SHUFFLE C, D, RMask + SDValue C, D; + SmallVector<int, 16> RMask(NumElts); + if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) + C = RHS.getOperand(0); + if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) + D = RHS.getOperand(1); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), RMask.begin()); + } else { + if (RHS.getOpcode() != ISD::UNDEF) + C = RHS; + for (unsigned i = 0; i != NumElts; ++i) + RMask[i] = i; + } + + // Check that the shuffles are both shuffling the same vectors. + if (!(A == C && B == D) && !(A == D && B == C)) + return false; + + // If everything is UNDEF then bail out: it would be better to fold to UNDEF. + if (!A.getNode() && !B.getNode()) + return false; + + // If A and B occur in reverse order in RHS, then "swap" them (which means + // rewriting the mask). + if (A != C) + ShuffleVectorSDNode::commuteMask(RMask); + + // At this point LHS and RHS are equivalent to + // LHS = VECTOR_SHUFFLE A, B, LMask + // RHS = VECTOR_SHUFFLE A, B, RMask + // Check that the masks correspond to performing a horizontal operation. + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + int LIdx = LMask[i+l], RIdx = RMask[i+l]; + + // Ignore any UNDEF components. 
+ if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) + continue; + + // Check that successive elements are being operated on. If not, this is + // not a horizontal operation. + unsigned Src = (i/HalfLaneElts); // each lane is split between srcs + int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) + return false; + } + } + + LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. + RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. + return true; +} + +/// Do target-specific dag combines on floating point adds. +static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Try to synthesize horizontal adds from adds of shuffles. + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + isHorizontalBinOp(LHS, RHS, true)) + return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); + return SDValue(); +} + +/// Do target-specific dag combines on floating point subs. +static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Try to synthesize horizontal subs from subs of shuffles. + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + isHorizontalBinOp(LHS, RHS, false)) + return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); + return SDValue(); +} + +/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. +static SDValue +combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || + Regs[0].getValueType() == MVT::v2i64)); + EVT OutVT = N->getValueType(0); + EVT OutSVT = OutVT.getVectorElementType(); + EVT InVT = Regs[0].getValueType(); + EVT InSVT = InVT.getVectorElementType(); + SDLoc DL(N); + + // First, use mask to unset all bits that won't appear in the result. + assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && + "OutSVT can only be either i8 or i16."); + SDValue MaskVal = + DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT); + SDValue MaskVec = DAG.getNode( + ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal)); + for (auto &Reg : Regs) + Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); + + MVT UnpackedVT, PackedVT; + if (OutSVT == MVT::i8) { + UnpackedVT = MVT::v8i16; + PackedVT = MVT::v16i8; + } else { + UnpackedVT = MVT::v4i32; + PackedVT = MVT::v8i16; + } + + // In each iteration, truncate the type by a half size. 
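+  // Rough walk-through (illustrative, for a v16i32 -> v16i8 truncation):
+  // we start from four v4i32 registers whose high bits were masked off above.
+  // Each round bitcasts the registers to the unpacked type and PACKUSes
+  // adjacent pairs, halving the register count (and, conceptually, the
+  // element width: i32 -> i16 -> i8) until one packed register remains.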
+  auto RegNum = Regs.size();
+  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
+       j < e; j *= 2, RegNum /= 2) {
+    for (unsigned i = 0; i < RegNum; i++)
+      Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
+    for (unsigned i = 0; i < RegNum / 2; i++)
+      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
+                            Regs[i * 2 + 1]);
+  }
+
+  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
+  // and then extract a subvector as the result since v8i8 is not a legal type.
+  if (OutVT == MVT::v8i8) {
+    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+                          DAG.getIntPtrConstant(0, DL));
+    return Regs[0];
+  } else if (RegNum > 1) {
+    Regs.resize(RegNum);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+  EVT OutVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
+  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+  for (auto &Reg : Regs) {
+    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+  }
+
+  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+                          Regs[i * 2 + 1]);
+
+  if (Regs.size() > 2) {
+    Regs.resize(Regs.size() / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with each
+/// element that is extracted from a vector and then truncated, and it is
+/// difficult to do this optimization based on them.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = OutVT.getVectorNumElements();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
+  if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+    return SDValue();
+
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+        NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+  if (Subtarget->hasSSSE3() && NumElems == 8 &&
+      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
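+  // E.g. (illustrative) a v16i32 source occupies 512 bits, so it is split
+  // into four v4i32 subvectors extracted at element offsets 0, 4, 8 and 12;
+  // a v8i64 source would likewise be split into four v2i64 subvectors.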
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  if (InSVT == MVT::i32) {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                              DAG.getIntPtrConstant(i * 4, DL));
+  } else {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                              DAG.getIntPtrConstant(i * 2, DL));
+  }
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
+  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
+  // truncate 2 x v4i32 to v8i16.
+  if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+  else if (InSVT == MVT::i32)
+    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+  else
+    return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  // Try to detect AVG pattern first.
+  SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+                                 Subtarget, SDLoc(N));
+  if (Avg.getNode())
+    return Avg;
+
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on floating point negations.
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  SDValue Arg = N->getOperand(0);
+  SDLoc DL(N);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // If we're negating a FMUL node on a target with FMA, then we can avoid the
+  // use of a constant by performing (-0 - A*B) instead.
+  // FIXME: Check rounding control flags as well once it becomes available.
+  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+      Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+    return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                       Arg.getOperand(1), Zero);
+  }
+
+  // If we're negating a FMA node, then we can adjust the
+  // instruction to include the extra negation.
+  if (Arg.hasOneUse()) {
+    switch (Arg.getOpcode()) {
+    case X86ISD::FMADD:
+      return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMSUB:
+      return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMADD:
+      return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMSUB:
+      return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    }
+  }
+  return SDValue();
+}
+
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+    // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extension.
+    // These logic operations may be executed in the integer domain.
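+    // For example (illustrative): without AVX512DQ, (v16f32 FXOR a, b) is
+    // rewritten below as
+    //   bitcast to v16i32, ISD::XOR, bitcast back to v16f32
+    // which the backend can then select as an EVEX integer op such as VPXORD.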
+ SDLoc dl(N); + MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); + + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); + unsigned IntOpcode = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected FP logic op"); + case X86ISD::FOR: IntOpcode = ISD::OR; break; + case X86ISD::FXOR: IntOpcode = ISD::XOR; break; + case X86ISD::FAND: IntOpcode = ISD::AND; break; + case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; + } + SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); + return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); + } + return SDValue(); +} +/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. +static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); + + // F[X]OR(0.0, x) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + + // F[X]OR(x, 0.0) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(0); + + return lowerX86FPLogicOp(N, DAG, Subtarget); +} + +/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. +static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); + + // Only perform optimizations if UnsafeMath is used. + if (!DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes + // into FMINC and FMAXC, which are Commutative operations. + unsigned NewOp = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("unknown opcode"); + case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; + case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; + } + + return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), + N->getOperand(0), N->getOperand(1)); +} + +static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + if (Subtarget->useSoftFloat()) + return SDValue(); + + // TODO: Check for global or instruction-level "nnan". In that case, we + // should be able to lower to FMAX/FMIN alone. + // TODO: If an operand is already known to be a NaN or not a NaN, this + // should be an optional swap and FMAX/FMIN. + + EVT VT = N->getValueType(0); + if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + return SDValue(); + + // This takes at least 3 instructions, so favor a library call when operating + // on a scalar and minimizing code size. 
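+  // (Illustrative) for scalar f32 fmaxnum the node sequence built below maps
+  // to roughly three instructions:
+  //   maxss                          ; FMAX with the operands commuted
+  //   cmpunordss                     ; SETUO: is Op0 a NaN?
+  //   blendvps (or andps/andnps/orps); pick Op1 if Op0 was NaN, else the max
+  // hence the "at least 3 instructions" remark above.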
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDLoc DL(N); + EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( + DAG.getDataLayout(), *DAG.getContext(), VT); + + // There are 4 possibilities involving NaN inputs, and these are the required + // outputs: + // Op1 + // Num NaN + // ---------------- + // Num | Max | Op0 | + // Op0 ---------------- + // NaN | Op1 | NaN | + // ---------------- + // + // The SSE FP max/min instructions were not designed for this case, but rather + // to implement: + // Min = Op1 < Op0 ? Op1 : Op0 + // Max = Op1 > Op0 ? Op1 : Op0 + // + // So they always return Op0 if either input is a NaN. However, we can still + // use those instructions for fmaxnum by selecting away a NaN input. + + // If either operand is NaN, the 2nd source operand (Op0) is passed through. + auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; + SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); + SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); + + // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands + // are NaN, the NaN value of Op1 is the result. + auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); +} + +/// Do target-specific dag combines on X86ISD::FAND nodes. +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // FAND(0.0, x) -> 0.0 + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(0); + + // FAND(x, 0.0) -> 0.0 + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + + return lowerX86FPLogicOp(N, DAG, Subtarget); +} + +/// Do target-specific dag combines on X86ISD::FANDN nodes +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // FANDN(0.0, x) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + + // FANDN(x, 0.0) -> 0.0 + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + + return lowerX86FPLogicOp(N, DAG, Subtarget); +} + +static SDValue PerformBTCombine(SDNode *N, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // BT ignores high bits in the bit index operand. 
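+  // E.g. (illustrative) for a 64-bit BT only the low Log2(64) = 6 bits of the
+  // bit index are demanded, so a preceding (and %idx, 63) can be stripped by
+  // the SimplifyDemandedBits call below.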
+ SDValue Op1 = N->getOperand(1); + if (Op1.hasOneUse()) { + unsigned BitWidth = Op1.getValueSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || + TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); + } + return SDValue(); +} + +static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { + SDValue Op = N->getOperand(0); + if (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + EVT VT = N->getValueType(0), OpVT = Op.getValueType(); + if (Op.getOpcode() == X86ISD::VZEXT_LOAD && + VT.getVectorElementType().getSizeInBits() == + OpVT.getVectorElementType().getSizeInBits()) { + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); + } + return SDValue(); +} + +static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); + SDLoc dl(N); + + // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the + // both SSE and AVX2 since there is no sign-extended shift right + // operation on a vector with 64-bit elements. + //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> + // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) + if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || + N0.getOpcode() == ISD::SIGN_EXTEND)) { + SDValue N00 = N0.getOperand(0); + + // EXTLOAD has a better solution on AVX2, + // it may be replaced with X86ISD::VSEXT node. + if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) + if (!ISD::isNormalLoad(N00.getNode())) + return SDValue(); + + if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { + SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, + N00, N1); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); + } + } + return SDValue(); +} + +/// sext(add_nsw(x, C)) --> add(sext(x), C_sext) +/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities +/// to combine math ops, use an LEA, or use a complex addressing mode. This can +/// eliminate extend, add, and shift instructions. +static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // TODO: This should be valid for other integer types. + EVT VT = Sext->getValueType(0); + if (VT != MVT::i64) + return SDValue(); + + // We need an 'add nsw' feeding into the 'sext'. + SDValue Add = Sext->getOperand(0); + if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap()) + return SDValue(); + + // Having a constant operand to the 'add' ensures that we are not increasing + // the instruction count because the constant is extended for free below. + // A constant operand can also become the displacement field of an LEA. + auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); + if (!AddOp1) + return SDValue(); + + // Don't make the 'add' bigger if there's no hope of combining it with some + // other 'add' or 'shl' instruction. + // TODO: It may be profitable to generate simpler LEA instructions in place + // of single 'add' instructions, but the cost model for selecting an LEA + // currently has a high threshold. 
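+  // Worked example (illustrative):
+  //   t1 = add nsw i32 %x, 5
+  //   t2 = sext t1 to i64            ; t2 feeds an add/shl (address math)
+  // becomes
+  //   t1 = sext i32 %x to i64
+  //   t2 = add nsw i64 t1, 5
+  // so the +5 can later fold into an LEA or a complex addressing mode.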
+ bool HasLEAPotential = false; + for (auto *User : Sext->uses()) { + if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { + HasLEAPotential = true; + break; + } + } + if (!HasLEAPotential) + return SDValue(); + + // Everything looks good, so pull the 'sext' ahead of the 'add'. + int64_t AddConstant = AddOp1->getSExtValue(); + SDValue AddOp0 = Add.getOperand(0); + SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); + SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); + + // The wider add is guaranteed to not wrap because both operands are + // sign-extended. + SDNodeFlags Flags; + Flags.setNoSignedWrap(true); + return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); +} + +static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT InVT = N0.getValueType(); + EVT InSVT = InVT.getScalarType(); + SDLoc DL(N); + + // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> + // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) + // This exposes the sext to the sdivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). + if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && + InVT == MVT::i8 && VT == MVT::i32) { + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + + if (!DCI.isBeforeLegalizeOps()) { + if (InVT == MVT::i1) { + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue AllOnes = + DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); + return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); + } + return SDValue(); + } + + if (VT.isVector() && Subtarget->hasSSE2()) { + auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) { + EVT InVT = N.getValueType(); + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), + Size / InVT.getScalarSizeInBits()); + SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), + DAG.getUNDEF(InVT)); + Opnds[0] = N; + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); + }; + + // If target-size is less than 128-bits, extend to a type that would extend + // to 128 bits, extend that and extract the original target vector. + if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + unsigned Scale = 128 / VT.getSizeInBits(); + EVT ExVT = + EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); + SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, + DAG.getIntPtrConstant(0, DL)); + } + + // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG + // which ensures lowering to X86ISD::VSEXT (pmovsx*). 
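+    // E.g. (illustrative) (v4i32 sign_extend (v4i16 x)) widens x with undef
+    // elements to v8i16 and then emits sign_extend_vector_inreg, which
+    // typically selects to a single PMOVSXWD.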
+ if (VT.getSizeInBits() == 128 && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + SDValue ExOp = ExtendVecSize(DL, N0, 128); + return DAG.getSignExtendVectorInReg(ExOp, DL, VT); + } + + // On pre-AVX2 targets, split into 128-bit nodes of + // ISD::SIGN_EXTEND_VECTOR_INREG. + if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + unsigned NumVecs = VT.getSizeInBits() / 128; + unsigned NumSubElts = 128 / SVT.getSizeInBits(); + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); + + SmallVector<SDValue, 8> Opnds; + for (unsigned i = 0, Offset = 0; i != NumVecs; + ++i, Offset += NumSubElts) { + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, + DAG.getIntPtrConstant(Offset, DL)); + SrcVec = ExtendVecSize(DL, SrcVec, 128); + SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT); + Opnds.push_back(SrcVec); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); + } + } + + if (Subtarget->hasAVX() && VT.is256BitVector()) + if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) + return R; + + if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget)) + return NewAdd; + + return SDValue(); +} + +static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + EVT ScalarVT = VT.getScalarType(); + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) + return SDValue(); + + SDValue A = N->getOperand(0); + SDValue B = N->getOperand(1); + SDValue C = N->getOperand(2); + + bool NegA = (A.getOpcode() == ISD::FNEG); + bool NegB = (B.getOpcode() == ISD::FNEG); + bool NegC = (C.getOpcode() == ISD::FNEG); + + // Negative multiplication when NegA xor NegB + bool NegMul = (NegA != NegB); + if (NegA) + A = A.getOperand(0); + if (NegB) + B = B.getOperand(0); + if (NegC) + C = C.getOperand(0); + + unsigned Opcode; + if (!NegMul) + Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; + else + Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; + + return DAG.getNode(Opcode, dl, VT, A, B, C); +} + +static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> + // (and (i32 x86isd::setcc_carry), 1) + // This eliminates the zext. This transformation is necessary because + // ISD::SETCC is always legalized to i8. 
+ SDLoc dl(N); + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + if (N0.getOpcode() == ISD::AND && + N0.hasOneUse() && + N0.getOperand(0).hasOneUse()) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + if (!isOneConstant(N0.getOperand(1))) + return SDValue(); + return DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, + N00.getOperand(0), N00.getOperand(1)), + DAG.getConstant(1, dl, VT)); + } + } + + if (N0.getOpcode() == ISD::TRUNCATE && + N0.hasOneUse() && + N0.getOperand(0).hasOneUse()) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + return DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, + N00.getOperand(0), N00.getOperand(1)), + DAG.getConstant(1, dl, VT)); + } + } + + if (VT.is256BitVector()) + if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) + return R; + + // (i8,i32 zext (udivrem (i8 x, i8 y)) -> + // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) + // This exposes the zext to the udivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). + if (N0.getOpcode() == ISD::UDIVREM && + N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && + VT == MVT::i32) { + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + + return SDValue(); +} + +// Optimize x == -y --> x+y == 0 +// x != -y --> x+y != 0 +static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) + if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) + if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } + + if (VT.getScalarType() == MVT::i1 && + (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { + bool IsSEXT0 = + (LHS.getOpcode() == ISD::SIGN_EXTEND) && + (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); + bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); + + if (!IsSEXT0 || !IsVZero1) { + // Swap the operands and update the condition code. 
+      std::swap(LHS, RHS);
+      CC = ISD::getSetCCSwappedOperands(CC);
+
+      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+    }
+
+    if (IsSEXT0 && IsVZero1) {
+      assert(VT == LHS.getOperand(0).getValueType() &&
+             "Unexpected operand type");
+      if (CC == ISD::SETGT)
+        return DAG.getConstant(0, DL, VT);
+      if (CC == ISD::SETLE)
+        return DAG.getConstant(1, DL, VT);
+      if (CC == ISD::SETEQ || CC == ISD::SETGE)
+        return DAG.getNOT(DL, LHS.getOperand(0), VT);
+
+      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+             "Unexpected condition code!");
+      return LHS.getOperand(0);
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  // Gather and Scatter instructions use k-registers for masks. The type of
+  // the masks is v*i1. So the mask will be truncated anyway.
+  // The SIGN_EXTEND_INREG may be dropped.
+  SDValue Mask = N->getOperand(2);
+  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+    NewOps[2] = Mask.getOperand(0);
+    DAG.UpdateNodeOperands(N, NewOps);
+  }
+  return SDValue();
+}
+
+// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
+// as "sbb reg,reg", since it can be extended without zext and produces
+// an all-ones bit which is more useful than 0/1 in some cases.
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
+                               MVT VT) {
+  if (VT == MVT::i8)
+    return DAG.getNode(ISD::AND, DL, VT,
+                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                                   EFLAGS),
+                       DAG.getConstant(1, DL, VT));
+  assert (VT == MVT::i1 && "Unexpected type for SETCC node");
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
+                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                                 EFLAGS));
+}
+
+// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+  SDValue EFLAGS = N->getOperand(1);
+
+  if (CC == X86::COND_A) {
+    // Try to convert COND_A into COND_B in an attempt to facilitate
+    // materializing "setb reg".
+    //
+    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+    // cannot take an immediate as its first operand.
+    //
+    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+        EFLAGS.getValueType().isInteger() &&
+        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+                                   EFLAGS.getNode()->getVTList(),
+                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
+    }
+  }
+
+  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+  // a zext and produces an all-ones bit which is more useful than 0/1 in some
+  // cases.
+  if (CC == X86::COND_B)
+    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
+
+  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
+
+  return SDValue();
+}
+
+// Optimize branch condition evaluation.
+// +static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Dest = N->getOperand(1); + SDValue EFLAGS = N->getOperand(3); + X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); + + if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { + SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, + Flags); + } + + return SDValue(); +} + +static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, + SelectionDAG &DAG) { + // Take advantage of vector comparisons producing 0 or -1 in each lane to + // optimize away operation when it's from a constant. + // + // The general transformation is: + // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> + // AND(VECTOR_CMP(x,y), constant2) + // constant2 = UNARYOP(constant) + + // Early exit if this isn't a vector operation, the operand of the + // unary operation isn't a bitwise AND, or if the sizes of the operations + // aren't the same. + EVT VT = N->getValueType(0); + if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || + N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || + VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) + return SDValue(); + + // Now check that the other operand of the AND is a constant. We could + // make the transformation for non-constant splats as well, but it's unclear + // that would be a benefit as it would not eliminate any operations, just + // perform one more step in scalar code before moving to the vector unit. + if (BuildVectorSDNode *BV = + dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { + // Bail out if the vector isn't a constant. + if (!BV->isConstant()) + return SDValue(); + + // Everything checks out. Build up the new and improved node. + SDLoc DL(N); + EVT IntVT = BV->getValueType(0); + // Create a new constant of the appropriate type for the transformed + // DAG. + SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); + // The AND node needs bitcasts to/from an integer vector type around it. + SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, + N->getOperand(0)->getOperand(0), MaskConst); + SDValue Res = DAG.getBitcast(VT, NewAnd); + return Res; + } + + return SDValue(); +} + +static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT InVT = Op0.getValueType(); + EVT InSVT = InVT.getScalarType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) + // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) + if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { + SDLoc dl(N); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + InVT.getVectorNumElements()); + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + + if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)) + return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); + + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); + } + + return SDValue(); +} + +static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // First try to optimize away the conversion entirely when it's + // conditionally from a constant. Vectors only. 
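+  // E.g. (illustrative):
+  //   sint_to_fp (and (setcc a, b, cc), <i32 1, 1, 1, 1>)
+  // can become
+  //   and (setcc a, b, cc), <float 1.0, 1.0, 1.0, 1.0>   ; via bitcasts
+  // because each setcc lane is all-ones or all-zeros, so the AND either keeps
+  // the pre-converted constant lane or yields +0.0.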
+ if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) + return Res; + + // Now move on to more general possibilities. + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT InVT = Op0.getValueType(); + EVT InSVT = InVT.getScalarType(); + + // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) + // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) + if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { + SDLoc dl(N); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + InVT.getVectorNumElements()); + SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); + } + + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have + // a 32-bit target where SSE doesn't support i64->FP operations. + if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { + LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); + EVT LdVT = Ld->getValueType(0); + + // This transformation is not supported if the result type is f16 + if (VT == MVT::f16) + return SDValue(); + + if (!Ld->isVolatile() && !VT.isVector() && + ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && + !Subtarget->is64Bit() && LdVT == MVT::i64) { + SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( + SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); + DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); + return FILDChain; + } + } + return SDValue(); +} + +// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS +static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, + X86TargetLowering::DAGCombinerInfo &DCI) { + // If the LHS and RHS of the ADC node are zero, then it can't overflow and + // the result is either zero or one (depending on the input carry bit). + // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. + if (X86::isZeroNode(N->getOperand(0)) && + X86::isZeroNode(N->getOperand(1)) && + // We don't have a good way to replace an EFLAGS use, so only do this when + // dead right now. + SDValue(N, 1).use_empty()) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); + SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, + MVT::i8), + N->getOperand(2)), + DAG.getConstant(1, DL, VT)); + return DCI.CombineTo(N, Res1, CarryOut); + } + + return SDValue(); +} + +// fold (add Y, (sete X, 0)) -> adc 0, Y +// (add Y, (setne X, 0)) -> sbb -1, Y +// (sub (sete X, 0), Y) -> sbb 0, Y +// (sub (setne X, 0), Y) -> adc -1, Y +static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + + // Look through ZExts. + SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 
1 : 0); + if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) + return SDValue(); + + SDValue SetCC = Ext.getOperand(0); + if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) + return SDValue(); + + X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + SDValue Cmp = SetCC.getOperand(1); + if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || + !X86::isZeroNode(Cmp.getOperand(1)) || + !Cmp.getOperand(0).getValueType().isInteger()) + return SDValue(); + + SDValue CmpOp0 = Cmp.getOperand(0); + SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, + DAG.getConstant(1, DL, CmpOp0.getValueType())); + + SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); + if (CC == X86::COND_NE) + return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, + DL, OtherVal.getValueType(), OtherVal, + DAG.getConstant(-1ULL, DL, OtherVal.getValueType()), + NewCmp); + return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, + DL, OtherVal.getValueType(), OtherVal, + DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); +} + +/// PerformADDCombine - Do target-specific dag combines on integer adds. +static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Try to synthesize horizontal adds from adds of shuffles. + if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); + + return OptimizeConditionalInDecrement(N, DAG); +} + +static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // X86 can't encode an immediate LHS of a sub. See if we can push the + // negation into a preceding instruction. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { + // If the RHS of the sub is a XOR with one use and a constant, invert the + // immediate. Then add one to the LHS of the sub so we can turn + // X-Y -> X+~Y+1, saving one register. + if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && + isa<ConstantSDNode>(Op1.getOperand(1))) { + APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); + EVT VT = Op0.getValueType(); + SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, + Op1.getOperand(0), + DAG.getConstant(~XorC, SDLoc(Op1), VT)); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, + DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT)); + } + } + + // Try to synthesize horizontal adds from adds of shuffles. 
+ EVT VT = N->getValueType(0); + if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); + + return OptimizeConditionalInDecrement(N, DAG); +} + +/// performVZEXTCombine - Performs build vector combines +static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N->getSimpleValueType(0); + SDValue Op = N->getOperand(0); + MVT OpVT = Op.getSimpleValueType(); + MVT OpEltVT = OpVT.getVectorElementType(); + unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); + + // (vzext (bitcast (vzext (x)) -> (vzext x) + SDValue V = Op; + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + if (V != Op && V.getOpcode() == X86ISD::VZEXT) { + MVT InnerVT = V.getSimpleValueType(); + MVT InnerEltVT = InnerVT.getVectorElementType(); + + // If the element sizes match exactly, we can just do one larger vzext. This + // is always an exact type match as vzext operates on integer types. + if (OpEltVT == InnerEltVT) { + assert(OpVT == InnerVT && "Types must match for vzext!"); + return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); + } + + // The only other way we can combine them is if only a single element of the + // inner vzext is used in the input to the outer vzext. + if (InnerEltVT.getSizeInBits() < InputBits) + return SDValue(); + + // In this case, the inner vzext is completely dead because we're going to + // only look at bits inside of the low element. Just do the outer vzext on + // a bitcast of the input to the inner. + return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V)); + } + + // Check if we can bypass extracting and re-inserting an element of an input + // vector. Essentially: + // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { + SDValue ExtractedV = V.getOperand(0); + SDValue OrigV = ExtractedV.getOperand(0); + if (isNullConstant(ExtractedV.getOperand(1))) { + MVT OrigVT = OrigV.getSimpleValueType(); + // Extract a subvector if necessary... 
+ if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { + int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); + OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), + OrigVT.getVectorNumElements() / Ratio); + OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, + DAG.getIntPtrConstant(0, DL)); + } + Op = DAG.getBitcast(OpVT, OrigV); + return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); + } + } + + return SDValue(); +} + +SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: break; + case ISD::EXTRACT_VECTOR_ELT: + return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); + case ISD::VSELECT: + case ISD::SELECT: + case X86ISD::SHRUNKBLEND: + return PerformSELECTCombine(N, DAG, DCI, Subtarget); + case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget); + case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); + case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); + case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); + case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); + case ISD::MUL: return PerformMulCombine(N, DAG, DCI); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); + case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); + case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); + case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); + case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); + case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget); + case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); + case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); + case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); + case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget); + case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget); + case X86ISD::FXOR: + case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); + case X86ISD::FMIN: + case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); + case ISD::FMINNUM: + case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG, + Subtarget); + case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); + case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); + case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); +// TODO: refactor the [SU]DIVREM8_[SZ]EXT_HREG code so that it's not duplicated. 
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND_INREG:
+    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
+  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::SHUFP:       // Handle all target specific shuffles
+  case X86ISD::PALIGNR:
+  case X86ISD::BLENDI:
+  case X86ISD::UNPCKH:
+  case X86ISD::UNPCKL:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFB:
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::VPERMILPI:
+  case X86ISD::VPERM2X128:
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
+  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::MGATHER:
+  case ISD::MSCATTER:       return PerformGatherScatterCombine(N, DAG);
+  }
+
+  return SDValue();
+}
+
+/// isTypeDesirableForOp - Return true if the target has native support for
+/// the specified value type and it is 'desirable' to use the type for the
+/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+/// instruction encodings are longer and some i16 instructions are slow.
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+  if (!isTypeLegal(VT))
+    return false;
+  if (VT != MVT::i16)
+    return true;
+
+  switch (Opc) {
+  default:
+    return true;
+  case ISD::LOAD:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SUB:
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    return false;
+  }
+}
+
+/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
+/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
+/// we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+    MachineFunction *MF) const {
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  return any_of(MRI.reg_instructions(X86::EFLAGS),
+                [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
+/// IsDesirableToPromoteOp - This method queries the target whether it is
+/// beneficial for dag combiner to promote the specified node. If true, it
+/// should return the desired promotion type by reference.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+  EVT VT = Op.getValueType();
+  if (VT != MVT::i16)
+    return false;
+
+  bool Promote = false;
+  bool Commute = false;
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    // If the non-extending load has a single use and it's not live out, then it
+    // might be folded.
+    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
+                                                     Op.hasOneUse()*/) {
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+                                UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+        // The only case where we'd want to promote LOAD (rather than it being
+        // promoted as an operand) is when its only use is liveout.
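+        // (A value that is live out of the block shows up here as a use by an
+        // ISD::CopyToReg node, which is exactly what this loop checks for.)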
+ if (UI->getOpcode() != ISD::CopyToReg) + return false; + } + } + Promote = true; + break; + } + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + Promote = true; + break; + case ISD::SHL: + case ISD::SRL: { + SDValue N0 = Op.getOperand(0); + // Look out for (store (shl (load), x)). + if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) + return false; + Promote = true; + break; + } + case ISD::ADD: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + Commute = true; + // fallthrough + case ISD::SUB: { + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + if (!Commute && MayFoldLoad(N1)) + return false; + // Avoid disabling potential load folding opportunities. + if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) + return false; + if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) + return false; + Promote = true; + } + } + + PVT = MVT::i32; + return Promote; +} + +//===----------------------------------------------------------------------===// +// X86 Inline Assembly Support +//===----------------------------------------------------------------------===// + +// Helper to match a string separated by whitespace. +static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { + S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. + + for (StringRef Piece : Pieces) { + if (!S.startswith(Piece)) // Check if the piece matches. + return false; + + S = S.substr(Piece.size()); + StringRef::size_type Pos = S.find_first_not_of(" \t"); + if (Pos == 0) // We matched a prefix. + return false; + + S = S.substr(Pos); + } + + return S.empty(); +} + +static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { + + if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { + if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && + std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && + std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { + + if (AsmPieces.size() == 3) + return true; + else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) + return true; + } + } + return false; +} + +bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + + std::string AsmStr = IA->getAsmString(); + + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + + // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" + SmallVector<StringRef, 4> AsmPieces; + SplitString(AsmStr, AsmPieces, ";\n"); + + switch (AsmPieces.size()) { + default: return false; + case 1: + // FIXME: this should verify that we are targeting a 486 or better. If not, + // we will turn this bswap into something that will be lowered to logical + // ops instead of emitting the bswap asm. For now, we don't support 486 or + // lower so don't worry about this. + // bswap $0 + if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || + matchAsm(AsmPieces[0], {"bswapl", "$0"}) || + matchAsm(AsmPieces[0], {"bswapq", "$0"}) || + matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { + // No need to check constraints, nothing other than the equivalent of + // "=r,0" would be valid here. 
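+      // For example (illustrative, not part of the surrounding code):
+      // GCC-style inline asm such as
+      //   asm("bswap %0" : "=r"(x) : "0"(x));
+      // on a 32- or 64-bit integer reaches this point as "bswap $0" with
+      // constraint string "=r,0" and is replaced by an llvm.bswap.* call below.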
+ return IntrinsicLowering::LowerToByteSwap(CI); + } + + // rorw $$8, ${0:w} --> llvm.bswap.i16 + if (CI->getType()->isIntegerTy(16) && + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || + matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { + AsmPieces.clear(); + StringRef ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); + if (clobbersFlagRegisters(AsmPieces)) + return IntrinsicLowering::LowerToByteSwap(CI); + } + break; + case 3: + if (CI->getType()->isIntegerTy(32) && + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && + matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && + matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { + AsmPieces.clear(); + StringRef ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); + if (clobbersFlagRegisters(AsmPieces)) + return IntrinsicLowering::LowerToByteSwap(CI); + } + + if (CI->getType()->isIntegerTy(64)) { + InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); + if (Constraints.size() >= 2 && + Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && + Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { + // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 + if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && + matchAsm(AsmPieces[1], {"bswap", "%edx"}) && + matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) + return IntrinsicLowering::LowerToByteSwap(CI); + } + } + break; + } + return false; +} + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +X86TargetLowering::ConstraintType +X86TargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'R': + case 'q': + case 'Q': + case 'f': + case 't': + case 'u': + case 'y': + case 'x': + case 'Y': + case 'l': + return C_RegisterClass; + case 'a': + case 'b': + case 'c': + case 'd': + case 'S': + case 'D': + case 'A': + return C_Register; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'G': + case 'C': + case 'e': + case 'Z': + return C_Other; + default: + break; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight + X86TargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. 
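+  // A couple of concrete outcomes of the switch below (illustrative): an
+  // integer operand constrained to 'a' (EAX) yields CW_SpecificReg, while a
+  // small constant such as 5 under the 'I' constraint (0..31) yields
+  // CW_Constant.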
+ switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + case 'R': + case 'q': + case 'Q': + case 'a': + case 'b': + case 'c': + case 'd': + case 'S': + case 'D': + case 'A': + if (CallOperandVal->getType()->isIntegerTy()) + weight = CW_SpecificReg; + break; + case 'f': + case 't': + case 'u': + if (type->isFloatingPointTy()) + weight = CW_SpecificReg; + break; + case 'y': + if (type->isX86_MMXTy() && Subtarget->hasMMX()) + weight = CW_SpecificReg; + break; + case 'x': + case 'Y': + if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || + ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) + weight = CW_Register; + break; + case 'I': + if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { + if (C->getZExtValue() <= 31) + weight = CW_Constant; + } + break; + case 'J': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 63) + weight = CW_Constant; + } + break; + case 'K': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) + weight = CW_Constant; + } + break; + case 'L': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) + weight = CW_Constant; + } + break; + case 'M': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 3) + weight = CW_Constant; + } + break; + case 'N': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 0xff) + weight = CW_Constant; + } + break; + case 'G': + case 'C': + if (isa<ConstantFP>(CallOperandVal)) { + weight = CW_Constant; + } + break; + case 'e': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if ((C->getSExtValue() >= -0x80000000LL) && + (C->getSExtValue() <= 0x7fffffffLL)) + weight = CW_Constant; + } + break; + case 'Z': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 0xffffffff) + weight = CW_Constant; + } + break; + } + return weight; +} + +/// LowerXConstraint - try to replace an X constraint, which matches anything, +/// with another that has more specific requirements based on the type of the +/// corresponding operand. +const char *X86TargetLowering:: +LowerXConstraint(EVT ConstraintVT) const { + // FP X constraints get lowered to SSE1/2 registers if available, otherwise + // 'f' like normal targets. + if (ConstraintVT.isFloatingPoint()) { + if (Subtarget->hasSSE2()) + return "Y"; + if (Subtarget->hasSSE1()) + return "x"; + } + + return TargetLowering::LowerXConstraint(ConstraintVT); +} + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue>&Ops, + SelectionDAG &DAG) const { + SDValue Result; + + // Only support length 1 constraints for now. 
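+  // For instance (illustrative), 'I' accepts immediates in [0, 31], 'N'
+  // accepts [0, 255], and 'e' accepts any value that fits in a signed 32-bit
+  // immediate, as handled case by case below.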
+ if (Constraint.length() > 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: break; + case 'I': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 31) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'J': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 63) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'K': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (isInt<8>(C->getSExtValue())) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'L': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || + (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) { + Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'M': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 3) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'N': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 255) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'O': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 127) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'e': { + // 32-bit signed value + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getSExtValue())) { + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); + break; + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + } + return; + } + case 'Z': { + // 32-bit unsigned value + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getZExtValue())) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + return; + } + case 'i': { + // Literal immediates are always ok. + if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); + break; + } + + // In any sort of PIC mode addresses need to be computed at runtime by + // adding in a register or some sort of table lookup. These can't + // be used as immediates. + if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) + return; + + // If we are in non-pic codegen mode, we allow the address of a global (with + // an optional displacement) to be used with 'i'. + GlobalAddressSDNode *GA = nullptr; + int64_t Offset = 0; + + // Match either (GA), (GA+C), (GA+C1+C2), etc. 
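+    // For example (illustrative, SomeGlobal is a hypothetical symbol): a
+    // non-PIC reference equivalent to &SomeGlobal + 8 is folded here into a
+    // single TargetGlobalAddress with Offset == 8.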
+ while (1) { + if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { + Offset += GA->getOffset(); + break; + } else if (Op.getOpcode() == ISD::ADD) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Offset += C->getZExtValue(); + Op = Op.getOperand(0); + continue; + } + } else if (Op.getOpcode() == ISD::SUB) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Offset += -C->getZExtValue(); + Op = Op.getOperand(0); + continue; + } + } + + // Otherwise, this isn't something we can handle, reject it. + return; + } + + const GlobalValue *GV = GA->getGlobal(); + // If we require an extra load to get this address, as in PIC mode, we + // can't accept it. + if (isGlobalStubReference( + Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()))) + return; + + Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), + GA->getValueType(0), Offset); + break; + } + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +std::pair<unsigned, const TargetRegisterClass *> +X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, + MVT VT) const { + // First, see if this is a constraint that directly corresponds to an LLVM + // register class. + if (Constraint.size() == 1) { + // GCC Constraint Letters + switch (Constraint[0]) { + default: break; + // TODO: Slight differences here in allocation order and leaving + // RIP in the class. Do they matter any more here than they do + // in the normal allocation? + case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. + if (Subtarget->is64Bit()) { + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8RegClass); + if (VT == MVT::i64 || VT == MVT::f64) + return std::make_pair(0U, &X86::GR64RegClass); + break; + } + // 32-bit fallthrough + case 'Q': // Q_REGS + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32_ABCDRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_ABCDRegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); + if (VT == MVT::i64) + return std::make_pair(0U, &X86::GR64_ABCDRegClass); + break; + case 'r': // GENERAL_REGS + case 'l': // INDEX_REGS + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) + return std::make_pair(0U, &X86::GR32RegClass); + return std::make_pair(0U, &X86::GR64RegClass); + case 'R': // LEGACY_REGS + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8_NOREXRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_NOREXRegClass); + if (VT == MVT::i32 || !Subtarget->is64Bit()) + return std::make_pair(0U, &X86::GR32_NOREXRegClass); + return std::make_pair(0U, &X86::GR64_NOREXRegClass); + case 'f': // FP Stack registers. + // If SSE is enabled for this VT, use f80 to ensure the isel moves the + // value to the correct fpstack register class. 
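+      // (Illustrative: with SSE2 enabled, an f64 operand constrained to 'f'
+      // falls through to RFP80 below, keeping it on the x87 stack rather than
+      // in an SSE register.)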
+ if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) + return std::make_pair(0U, &X86::RFP32RegClass); + if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) + return std::make_pair(0U, &X86::RFP64RegClass); + return std::make_pair(0U, &X86::RFP80RegClass); + case 'y': // MMX_REGS if MMX allowed. + if (!Subtarget->hasMMX()) break; + return std::make_pair(0U, &X86::VR64RegClass); + case 'Y': // SSE_REGS if SSE2 allowed + if (!Subtarget->hasSSE2()) break; + // FALL THROUGH. + case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed + if (!Subtarget->hasSSE1()) break; + + switch (VT.SimpleTy) { + default: break; + // Scalar SSE types. + case MVT::f32: + case MVT::i32: + return std::make_pair(0U, &X86::FR32RegClass); + case MVT::f64: + case MVT::i64: + return std::make_pair(0U, &X86::FR64RegClass); + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. + // Vector types. + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v2i64: + case MVT::v4f32: + case MVT::v2f64: + return std::make_pair(0U, &X86::VR128RegClass); + // AVX types. + case MVT::v32i8: + case MVT::v16i16: + case MVT::v8i32: + case MVT::v4i64: + case MVT::v8f32: + case MVT::v4f64: + return std::make_pair(0U, &X86::VR256RegClass); + case MVT::v8f64: + case MVT::v16f32: + case MVT::v16i32: + case MVT::v8i64: + return std::make_pair(0U, &X86::VR512RegClass); + } + break; + } + } + + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + std::pair<unsigned, const TargetRegisterClass*> Res; + Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + + // Not found as a standard register? + if (!Res.second) { + // Map st(0) -> st(7) -> ST0 + if (Constraint.size() == 7 && Constraint[0] == '{' && + tolower(Constraint[1]) == 's' && + tolower(Constraint[2]) == 't' && + Constraint[3] == '(' && + (Constraint[4] >= '0' && Constraint[4] <= '7') && + Constraint[5] == ')' && + Constraint[6] == '}') { + + Res.first = X86::FP0+Constraint[4]-'0'; + Res.second = &X86::RFP80RegClass; + return Res; + } + + // GCC allows "st(0)" to be called just plain "st". + if (StringRef("{st}").equals_lower(Constraint)) { + Res.first = X86::FP0; + Res.second = &X86::RFP80RegClass; + return Res; + } + + // flags -> EFLAGS + if (StringRef("{flags}").equals_lower(Constraint)) { + Res.first = X86::EFLAGS; + Res.second = &X86::CCRRegClass; + return Res; + } + + // 'A' means EAX + EDX. + if (Constraint == "A") { + Res.first = X86::EAX; + Res.second = &X86::GR32_ADRegClass; + return Res; + } + return Res; + } + + // Otherwise, check to see if this is a register class of the wrong value + // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to + // turn into {ax},{dx}. + // MVT::Other is used to specify clobber names. + if (Res.second->hasType(VT) || VT == MVT::Other) + return Res; // Correct type already, nothing to do. + + // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should + // return "eax". This should even work for things like getting 64bit integer + // registers when given an f64 type. + const TargetRegisterClass *Class = Res.second; + if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || + Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { + unsigned Size = VT.getSizeInBits(); + if (Size == 1) Size = 8; + unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); + if (DestReg > 0) { + Res.first = DestReg; + Res.second = Size == 8 ? 
&X86::GR8RegClass
+                   : Size == 16 ? &X86::GR16RegClass
+                   : Size == 32 ? &X86::GR32RegClass
+                   : &X86::GR64RegClass;
+      assert(Res.second->contains(Res.first) && "Register in register class");
+    } else {
+      // No register found/type mismatch.
+      Res.first = 0;
+      Res.second = nullptr;
+    }
+  } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass ||
+             Class == &X86::VR128RegClass || Class == &X86::VR256RegClass ||
+             Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass ||
+             Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass ||
+             Class == &X86::VR512RegClass) {
+    // Handle references to XMM physical registers that got mapped into the
+    // wrong class. This can happen with constraints like {xmm0} where the
+    // target independent register mapper will just pick the first match it can
+    // find, ignoring the required type.
+
+    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
+    if (VT == MVT::f32 || VT == MVT::i32)
+      Res.second = &X86::FR32RegClass;
+    else if (VT == MVT::f64 || VT == MVT::i64)
+      Res.second = &X86::FR64RegClass;
+    else if (X86::VR128RegClass.hasType(VT))
+      Res.second = &X86::VR128RegClass;
+    else if (X86::VR256RegClass.hasType(VT))
+      Res.second = &X86::VR256RegClass;
+    else if (X86::VR512RegClass.hasType(VT))
+      Res.second = &X86::VR512RegClass;
+    else {
+      // Type mismatch and not a clobber: return an error.
+      Res.first = 0;
+      Res.second = nullptr;
+    }
+  }
+
+  return Res;
+}
+
+int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
+                                            const AddrMode &AM, Type *Ty,
+                                            unsigned AS) const {
+  // Scaling factors are not free at all.
+  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+  // will take 2 allocations in the out of order engine instead of 1
+  // for plain addressing mode, i.e. inst (reg1).
+  // E.g.,
+  // vaddps (%rsi,%rdx), %ymm0, %ymm1
+  // Requires two allocations (one for the load, one for the computation)
+  // whereas:
+  // vaddps (%rsi), %ymm0, %ymm1
+  // Requires just 1 allocation, i.e., freeing allocations for other operations
+  // and having fewer micro operations to execute.
+  //
+  // For some X86 architectures, this is even worse because for instance for
+  // stores, the complex addressing mode forces the instruction to use the
+  // "load" ports instead of the dedicated "store" port.
+  // E.g., on Haswell:
+  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+  if (isLegalAddressingMode(DL, AM, Ty, AS))
+    // Scale represents reg2 * scale, thus account for 1
+    // as soon as we use a second register.
+    return AM.Scale != 0;
+  return -1;
+}
+
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+  // Integer division on x86 is expensive. However, when aggressively optimizing
+  // for code size, we prefer to use a div instruction, as it is usually smaller
+  // than the alternative sequence.
+  // The exception to this is vector division. Since x86 doesn't have vector
+  // integer division, leaving the division as-is is a loss even in terms of
+  // size, because it will have to be scalarized, while the alternative code
+  // sequence can be performed in vector form.
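+  // Illustrative consequence: under minsize a scalar i32 'x / 10' keeps its
+  // single div instruction, while for v4i32 this returns false and the
+  // multiply-based expansion is still used, since scalarizing four divisions
+  // would be larger.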
+ bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize); + return OptSize && !VT.isVector(); +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h new file mode 100644 index 0000000..8bb0e5f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -0,0 +1,1157 @@ +//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that X86 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H +#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H + +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" + +namespace llvm { + class X86Subtarget; + class X86TargetMachine; + + namespace X86ISD { + // X86 Specific DAG Nodes + enum NodeType : unsigned { + // Start the numbering where the builtin ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// Bit scan forward. + BSF, + /// Bit scan reverse. + BSR, + + /// Double shift instructions. These correspond to + /// X86::SHLDxx and X86::SHRDxx instructions. + SHLD, + SHRD, + + /// Bitwise logical AND of floating point values. This corresponds + /// to X86::ANDPS or X86::ANDPD. + FAND, + + /// Bitwise logical OR of floating point values. This corresponds + /// to X86::ORPS or X86::ORPD. + FOR, + + /// Bitwise logical XOR of floating point values. This corresponds + /// to X86::XORPS or X86::XORPD. + FXOR, + + /// Bitwise logical ANDNOT of floating point values. This + /// corresponds to X86::ANDNPS or X86::ANDNPD. + FANDN, + + /// These operations represent an abstract X86 call + /// instruction, which includes a bunch of information. In particular the + /// operands of these node are: + /// + /// #0 - The incoming token chain + /// #1 - The callee + /// #2 - The number of arg bytes the caller pushes on the stack. + /// #3 - The number of arg bytes the callee pops off the stack. + /// #4 - The value to pass in AL/AX/EAX (optional) + /// #5 - The value to pass in DL/DX/EDX (optional) + /// + /// The result values of these nodes are: + /// + /// #0 - The outgoing token chain + /// #1 - The first register result value (optional) + /// #2 - The second register result value (optional) + /// + CALL, + + /// This operation implements the lowering for readcyclecounter + RDTSC_DAG, + + /// X86 Read Time-Stamp Counter and Processor ID. + RDTSCP_DAG, + + /// X86 Read Performance Monitoring Counters. + RDPMC_DAG, + + /// X86 compare and logical compare instructions. + CMP, COMI, UCOMI, + + /// X86 bit-test instructions. + BT, + + /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS + /// operand, usually produced by a CMP instruction. + SETCC, + + /// X86 Select + SELECT, + + // Same as SETCC except it's materialized with a sbb and the value is all + // one's or all zero's. + SETCC_CARRY, // R = carry_bit ? ~0 : 0 + + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. 
+ FSETCC, + + /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values, + /// result in an integer GPR. Needs masking for scalar result. + FGETSIGNx86, + + /// X86 conditional moves. Operand 0 and operand 1 are the two values + /// to select from. Operand 2 is the condition code, and operand 3 is the + /// flag operand produced by a CMP or TEST instruction. It also writes a + /// flag result. + CMOV, + + /// X86 conditional branches. Operand 0 is the chain operand, operand 1 + /// is the block to branch if condition is true, operand 2 is the + /// condition code, and operand 3 is the flag operand produced by a CMP + /// or TEST instruction. + BRCOND, + + /// Return with a flag operand. Operand 0 is the chain operand, operand + /// 1 is the number of bytes of stack to pop. + RET_FLAG, + + /// Return from interrupt. Operand 0 is the number of bytes to pop. + IRET, + + /// Repeat fill, corresponds to X86::REP_STOSx. + REP_STOS, + + /// Repeat move, corresponds to X86::REP_MOVSx. + REP_MOVS, + + /// On Darwin, this node represents the result of the popl + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// A wrapper node for TargetConstantPool, + /// TargetExternalSymbol, and TargetGlobalAddress. + Wrapper, + + /// Special wrapper used under X86-64 PIC mode for RIP + /// relative displacements. + WrapperRIP, + + /// Copies a 64-bit value from the low word of an XMM vector + /// to an MMX vector. If you think this is too close to the previous + /// mnemonic, so do I; blame Intel. + MOVDQ2Q, + + /// Copies a 32-bit value from the low word of a MMX + /// vector to a GPR. + MMX_MOVD2W, + + /// Copies a GPR into the low 32-bit word of a MMX vector + /// and zero out the high word. + MMX_MOVW2D, + + /// Extract an 8-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRB. + PEXTRB, + + /// Extract a 16-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRW. + PEXTRW, + + /// Insert any element of a 4 x float vector into any element + /// of a destination 4 x floatvector. + INSERTPS, + + /// Insert the lower 8-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRB. + PINSRB, + + /// Insert the lower 16-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRW. + PINSRW, MMX_PINSRW, + + /// Shuffle 16 8-bit values within a vector. + PSHUFB, + + /// Compute Sum of Absolute Differences. + PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, + + /// Bitwise Logical AND NOT of Packed FP values. + ANDNP, + + /// Copy integer sign. + PSIGN, + + /// Blend where the selector is an immediate. + BLENDI, + + /// Blend where the condition has been shrunk. + /// This is used to emphasize that the condition mask is + /// no more valid for generic VSELECT optimizations. + SHRUNKBLEND, + + /// Combined add and sub on an FP vector. + ADDSUB, + + // FP vector ops with rounding mode. + FADD_RND, + FSUB_RND, + FMUL_RND, + FDIV_RND, + FMAX_RND, + FMIN_RND, + FSQRT_RND, + + // FP vector get exponent + FGETEXP_RND, + // Extract Normalized Mantissas + VGETMANT, + // FP Scale + SCALEF, + // Integer add/sub with unsigned saturation. + ADDUS, + SUBUS, + // Integer add/sub with signed saturation. + ADDS, + SUBS, + // Unsigned Integer average + AVG, + /// Integer horizontal add. + HADD, + + /// Integer horizontal sub. + HSUB, + + /// Floating point horizontal add. + FHADD, + + /// Floating point horizontal sub. 
+ FHSUB, + + // Integer absolute value + ABS, + + // Detect Conflicts Within a Vector + CONFLICT, + + /// Floating point max and min. + FMAX, FMIN, + + /// Commutative FMIN and FMAX. + FMAXC, FMINC, + + /// Floating point reciprocal-sqrt and reciprocal approximation. + /// Note that these typically require refinement + /// in order to obtain suitable precision. + FRSQRT, FRCP, + + // Thread Local Storage. + TLSADDR, + + // Thread Local Storage. A call to get the start address + // of the TLS block for the current module. + TLSBASEADDR, + + // Thread Local Storage. When calling to an OS provided + // thunk at the address from an earlier relocation. + TLSCALL, + + // Exception Handling helpers. + EH_RETURN, + + // SjLj exception handling setjmp. + EH_SJLJ_SETJMP, + + // SjLj exception handling longjmp. + EH_SJLJ_LONGJMP, + + /// Tail call return. See X86TargetLowering::LowerCall for + /// the list of operands. + TC_RETURN, + + // Vector move to low scalar and zero higher vector elements. + VZEXT_MOVL, + + // Vector integer zero-extend. + VZEXT, + + // Vector integer signed-extend. + VSEXT, + + // Vector integer truncate. + VTRUNC, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, VTRUNCS, + + // Vector FP extend. + VFPEXT, + + // Vector FP round. + VFPROUND, + + // Vector signed/unsigned integer to double. + CVTDQ2PD, CVTUDQ2PD, + + // Convert a vector to mask, set bits base on MSB. + CVT2MASK, + + // 128-bit vector logical left / right shift + VSHLDQ, VSRLDQ, + + // Vector shift elements + VSHL, VSRL, VSRA, + + // Vector shift elements by immediate + VSHLI, VSRLI, VSRAI, + + // Vector packed double/float comparison. + CMPP, + + // Vector integer comparisons. + PCMPEQ, PCMPGT, + // Vector integer comparisons, the result is in a mask vector. + PCMPEQM, PCMPGTM, + + /// Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + CMPM, + CMPMU, + // Vector comparison with rounding mode for FP values + CMPM_RND, + + // Arithmetic operations with FLAGS results. + ADD, SUB, ADC, SBB, SMUL, + INC, DEC, OR, XOR, AND, + + BEXTR, // Bit field extract + + UMUL, // LOW, HI, FLAGS = umul LHS, RHS + + // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS + SMUL8, UMUL8, + + // 8-bit divrem that zero-extend the high result (AH). + UDIVREM8_ZEXT_HREG, + SDIVREM8_SEXT_HREG, + + // X86-specific multiply by immediate. + MUL_IMM, + + // Vector bitwise comparisons. + PTEST, + + // Vector packed fp sign bitwise comparisons. + TESTP, + + // Vector "test" in AVX-512, the result is in a mask vector. + TESTM, + TESTNM, + + // OR/AND test for masks + KORTEST, + KTEST, + + // Several flavors of instructions with vector shuffle behaviors. + PACKSS, + PACKUS, + // Intra-lane alignr + PALIGNR, + // AVX512 inter-lane alignr + VALIGN, + PSHUFD, + PSHUFHW, + PSHUFLW, + SHUFP, + //Shuffle Packed Values at 128-bit granularity + SHUF128, + MOVDDUP, + MOVSHDUP, + MOVSLDUP, + MOVLHPS, + MOVLHPD, + MOVHLPS, + MOVLPS, + MOVLPD, + MOVSD, + MOVSS, + UNPCKL, + UNPCKH, + VPERMILPV, + VPERMILPI, + VPERMV, + VPERMV3, + VPERMIV3, + VPERMI, + VPERM2X128, + // Bitwise ternary logic + VPTERNLOG, + // Fix Up Special Packed Float32/64 values + VFIXUPIMM, + // Range Restriction Calculation For Packed Pairs of Float32/64 values + VRANGE, + // Reduce - Perform Reduction Transformation on scalar\packed FP + VREDUCE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits + VRNDSCALE, + // VFPCLASS - Tests Types Of a FP Values for packed types. 
+ VFPCLASS, + // VFPCLASSS - Tests Types Of a FP Values for scalar types. + VFPCLASSS, + // Broadcast scalar to vector + VBROADCAST, + // Broadcast mask to vector + VBROADCASTM, + // Broadcast subvector to vector + SUBV_BROADCAST, + // Insert/Extract vector element + VINSERT, + VEXTRACT, + + /// SSE4A Extraction and Insertion. + EXTRQI, INSERTQI, + + // XOP variable/immediate rotations + VPROT, VPROTI, + // XOP arithmetic/logical shifts + VPSHA, VPSHL, + // XOP signed/unsigned integer comparisons + VPCOM, VPCOMU, + + // Vector multiply packed unsigned doubleword integers + PMULUDQ, + // Vector multiply packed signed doubleword integers + PMULDQ, + // Vector Multiply Packed UnsignedIntegers with Round and Scale + MULHRS, + // Multiply and Add Packed Integers + VPMADDUBSW, VPMADDWD, + // FMA nodes + FMADD, + FNMADD, + FMSUB, + FNMSUB, + FMADDSUB, + FMSUBADD, + // FMA with rounding mode + FMADD_RND, + FNMADD_RND, + FMSUB_RND, + FNMSUB_RND, + FMADDSUB_RND, + FMSUBADD_RND, + + // Compress and expand + COMPRESS, + EXPAND, + + //Convert Unsigned/Integer to Scalar Floating-Point Value + //with rounding mode + SINT_TO_FP_RND, + UINT_TO_FP_RND, + + // Vector float/double to signed/unsigned integer. + FP_TO_SINT_RND, FP_TO_UINT_RND, + // Save xmm argument registers to the stack, according to %al. An operator + // is needed so that this can be expanded with control flow. + VASTART_SAVE_XMM_REGS, + + // Windows's _chkstk call to do stack probing. + WIN_ALLOCA, + + // For allocating variable amounts of stack space when using + // segmented stacks. Check if the current stacklet has enough space, and + // falls back to heap allocation if not. + SEG_ALLOCA, + + // Memory barrier + MEMBARRIER, + MFENCE, + SFENCE, + LFENCE, + + // Store FP status word into i16 register. + FNSTSW16r, + + // Store contents of %ah into %eflags. + SAHF, + + // Get a random integer and indicate whether it is valid in CF. + RDRAND, + + // Get a NIST SP800-90B & C compliant random integer and + // indicate whether it is valid in CF. + RDSEED, + + PCMPISTRI, + PCMPESTRI, + + // Test if in transactional execution. + XTEST, + + // ERI instructions + RSQRT28, RCP28, EXP2, + + // Compare and swap. + LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, + LCMPXCHG8_DAG, + LCMPXCHG16_DAG, + + // Load, scalar_to_vector, and zero extend. + VZEXT_LOAD, + + // Store FP control world into i16 memory. + FNSTCW16m, + + /// This instruction implements FP_TO_SINT with the + /// integer destination in memory and a FP reg source. This corresponds + /// to the X86::FIST*m instructions and the rounding mode change stuff. It + /// has two inputs (token chain and address) and two outputs (int value + /// and token chain). + FP_TO_INT16_IN_MEM, + FP_TO_INT32_IN_MEM, + FP_TO_INT64_IN_MEM, + + /// This instruction implements SINT_TO_FP with the + /// integer source in memory and FP reg result. This corresponds to the + /// X86::FILD*m instructions. It has three inputs (token chain, address, + /// and source type) and two outputs (FP value and token chain). FILD_FLAG + /// also produces a flag). + FILD, + FILD_FLAG, + + /// This instruction implements an extending load to FP stack slots. + /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain + /// operand, ptr to load from, and a ValueType node indicating the type + /// to load to. + FLD, + + /// This instruction implements a truncating store to FP stack + /// slots. This corresponds to the X86::FST32m / X86::FST64m. 
It takes a + /// chain operand, value to store, address, and a ValueType to store it + /// as. + FST, + + /// This instruction grabs the address of the next argument + /// from a va_list. (reads and modifies the va_list in memory) + VAARG_64 + + // WARNING: Do not add anything in the end unless you want the node to + // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be + // thought as target memory ops! + }; + } + + /// Define some predicates that are used for node matching. + namespace X86 { + /// Return true if the specified + /// EXTRACT_SUBVECTOR operand specifies a vector extract that is + /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions. + bool isVEXTRACT128Index(SDNode *N); + + /// Return true if the specified + /// INSERT_SUBVECTOR operand specifies a subvector insert that is + /// suitable for input to VINSERTF128, VINSERTI128 instructions. + bool isVINSERT128Index(SDNode *N); + + /// Return true if the specified + /// EXTRACT_SUBVECTOR operand specifies a vector extract that is + /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions. + bool isVEXTRACT256Index(SDNode *N); + + /// Return true if the specified + /// INSERT_SUBVECTOR operand specifies a subvector insert that is + /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions. + bool isVINSERT256Index(SDNode *N); + + /// Return the appropriate + /// immediate to extract the specified EXTRACT_SUBVECTOR index + /// with VEXTRACTF128, VEXTRACTI128 instructions. + unsigned getExtractVEXTRACT128Immediate(SDNode *N); + + /// Return the appropriate + /// immediate to insert at the specified INSERT_SUBVECTOR index + /// with VINSERTF128, VINSERT128 instructions. + unsigned getInsertVINSERT128Immediate(SDNode *N); + + /// Return the appropriate + /// immediate to extract the specified EXTRACT_SUBVECTOR index + /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions. + unsigned getExtractVEXTRACT256Immediate(SDNode *N); + + /// Return the appropriate + /// immediate to insert at the specified INSERT_SUBVECTOR index + /// with VINSERTF64x4, VINSERTI64x4 instructions. + unsigned getInsertVINSERT256Immediate(SDNode *N); + + /// Returns true if Elt is a constant zero or floating point constant +0.0. + bool isZeroNode(SDValue Elt); + + /// Returns true of the given offset can be + /// fit into displacement field of the instruction. + bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, + bool hasSymbolicDisplacement = true); + + + /// Determines whether the callee is required to pop its + /// own arguments. Callee pop is necessary to support tail calls. + bool isCalleePop(CallingConv::ID CallingConv, + bool is64Bit, bool IsVarArg, bool TailCallOpt); + + } + + //===--------------------------------------------------------------------===// + // X86 Implementation of the TargetLowering interface + class X86TargetLowering final : public TargetLowering { + public: + explicit X86TargetLowering(const X86TargetMachine &TM, + const X86Subtarget &STI); + + unsigned getJumpTableEncoding() const override; + bool useSoftFloat() const override; + + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i8; + } + + const MCExpr * + LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, unsigned uid, + MCContext &Ctx) const override; + + /// Returns relocation base for the given PIC jumptable. 
+    SDValue getPICJumpTableRelocBase(SDValue Table,
+                                     SelectionDAG &DAG) const override;
+    const MCExpr *
+    getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                 unsigned JTI, MCContext &Ctx) const override;
+
+    /// Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area. For X86, aggregates
+    /// that contain SSE vectors are placed at 16-byte boundaries while the
+    /// rest are at 4-byte boundaries.
+    unsigned getByValTypeAlignment(Type *Ty,
+                                   const DataLayout &DL) const override;
+
+    /// Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero, that means the destination
+    /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
+    /// means there isn't a need to check it against the alignment requirement,
+    /// probably because the source does not need to be loaded. If 'IsMemset' is
+    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                            bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                            MachineFunction &MF) const override;
+
+    /// Returns true if it's safe to use load / store of the
+    /// specified type to expand memcpy / memset inline. This is mostly true
+    /// for all types except for some special cases. For example, on X86
+    /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+    /// also does type conversion. Note the specified type doesn't have to be
+    /// legal as the hook is used before type legalization.
+    bool isSafeMemOpType(MVT VT) const override;
+
+    /// Returns true if the target allows unaligned memory accesses of the
+    /// specified type. Returns whether it is "fast" in the last argument.
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+                                        bool *Fast) const override;
+
+    /// Provide custom lowering hooks for some operations.
+    ///
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// Replace the results of a node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                            SelectionDAG &DAG) const override;
+
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    /// Return true if the target has native support for
+    /// the specified value type and it is 'desirable' to use the type for the
+    /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+    /// instruction encodings are longer and some i16 instructions are slow.
+    bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
+    /// Return true if the target has native support for the
+    /// specified value type and it is 'desirable' to use the type. e.g. On x86
+    /// i16 is legal, but undesirable since i16 instruction encodings are longer
+    /// and some i16 instructions are slow.
+    bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+
+    /// Return true if the MachineFunction contains a COPY which would imply
+    /// HasOpaqueSPAdjustment.
+ bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; + + + /// This method returns the name of a target specific DAG node. + const char *getTargetNodeName(unsigned Opcode) const override; + + bool isCheapToSpeculateCttz() const override; + + bool isCheapToSpeculateCtlz() const override; + + /// Return the value type to use for ISD::SETCC. + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; + + /// Determine which of the bits specified in Mask are known to be either + /// zero or one and return them in the KnownZero/KnownOne bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + /// Determine the number of bits in the operation that are sign bits. + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const override; + + bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA, + int64_t &Offset) const override; + + SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; + + bool ExpandInlineAsm(CallInst *CI) const override; + + ConstraintType getConstraintType(StringRef Constraint) const override; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight + getSingleConstraintMatchWeight(AsmOperandInfo &info, + const char *constraint) const override; + + const char *LowerXConstraint(EVT ConstraintVT) const override; + + /// Lower the specified operand into the Ops vector. If it is invalid, don't + /// add anything to Ops. If hasMemory is true it means one of the asm + /// constraint of the inline asm instruction being processed is 'm'. + void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const override; + + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { + if (ConstraintCode == "i") + return InlineAsm::Constraint_i; + else if (ConstraintCode == "o") + return InlineAsm::Constraint_o; + else if (ConstraintCode == "v") + return InlineAsm::Constraint_v; + else if (ConstraintCode == "X") + return InlineAsm::Constraint_X; + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + + /// Given a physical register constraint + /// (e.g. {edx}), return the register number and the register class for the + /// register. This should only be used for C_Register constraints. On + /// error, this returns a register number of 0. + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + + /// Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; + + /// Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. 
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// Return true if the specified immediate is a legal
+    /// add immediate, that is, the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
+
+    /// \brief Return the cost of the scaling factor used in the addressing
+    /// mode represented by AM for this target, for a load/store
+    /// of the specified type.
+    /// If the AM is supported, the return value must be >= 0.
+    /// If the AM is not supported, it returns a negative value.
+    int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+
+    bool isVectorShiftByScalarCheap(Type *Ty) const override;
+
+    /// Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
+    /// register EAX to i16 by referencing its sub-register AX.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+    /// Return true if any actual instruction that defines a
+    /// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in
+    /// unknown ways, such as incoming arguments, or copies from unknown
+    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicitly zero-extend the
+    /// result out to 64 bits.
+    bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+    bool isZExtFree(EVT VT1, EVT VT2) const override;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+    /// extend node) is profitable.
+    bool isVectorLoadExtDesirable(SDValue) const override;
+
+    /// Return true if an FMA operation is faster than a pair of fmul and fadd
+    /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+    /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+    /// Return true if it's profitable to narrow
+    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+    /// from i32 to i8 but not from i32 to i16.
+    bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+    /// Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    /// Targets can use this to indicate that they only support *some*
+    /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
+    /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
+    /// be legal.
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                            EVT VT) const override;
+
+    /// Similar to isShuffleMaskLegal. Targets can use this to
+    /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to
+    /// replace a VAND with a constant pool entry.
+ bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, + EVT VT) const override; + + /// If true, then instruction selection should + /// seek to shrink the FP constant of the specified type to a smaller type + /// in order to save space and / or reduce runtime. + bool ShouldShrinkFPConstant(EVT VT) const override { + // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more + // expensive than a straight movsd. On the other hand, it's important to + // shrink long double fp constant since fldt is very slow. + return !X86ScalarSSEf64 || VT == MVT::f80; + } + + /// Return true if we believe it is correct and profitable to reduce the + /// load node to a smaller type. + bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, + EVT NewVT) const override; + + /// Return true if the specified scalar FP type is computed in an SSE + /// register, not on the X87 floating point stack. + bool isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + } + + /// \brief Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + /// Return true if EXTRACT_SUBVECTOR is cheap for this result type + /// with this index. + bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override; + + /// Intel processors have a unified instruction and data cache + const char * getClearCacheBuiltinName() const override { + return nullptr; // nothing to do, move along. + } + + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + + /// This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; + + /// Return true if the target stores stack protector cookies at a fixed + /// offset in some non-standard address space, and populates the address + /// space and offset as appropriate. + bool getStackCookieLocation(unsigned &AddressSpace, + unsigned &Offset) const override; + + /// Return true if the target stores SafeStack pointer at a fixed offset in + /// some non-standard address space, and populates the address space and + /// offset as appropriate. + Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; + + SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, + SelectionDAG &DAG) const; + + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + + bool useLoadStackGuardNode() const override; + /// \brief Customize the preferred legalization strategy for certain types. 
+ LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + + bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + + protected: + std::pair<const TargetRegisterClass *, uint8_t> + findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const override; + + private: + /// Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// Select between SSE or x87 floating point ops. + /// When SSE is available, use it for f32 operations. + /// When SSE2 is available, use it for f64 operations. + bool X86ScalarSSEf32; + bool X86ScalarSSEf64; + + /// A list of legal FP immediates. + std::vector<APFloat> LegalFPImmediates; + + /// Indicate that this x86 target can instruction + /// select the specified FP immediate natively. + void addLegalFPImmediate(const APFloat& Imm) { + LegalFPImmediates.push_back(Imm); + } + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue LowerMemArgument(SDValue Chain, + CallingConv::ID CallConv, + const SmallVectorImpl<ISD::InputArg> &ArgInfo, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, MachineFrameInfo *MFI, + unsigned i) const; + SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const; + + // Call lowering helpers. + + /// Check whether the call is eligible for tail call optimization. Targets + /// that want to do tail call optimization should implement this function. + bool IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + Type *RetTy, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, + SDValue Chain, bool IsTailCall, bool Is64Bit, + int FPDiff, SDLoc dl) const; + + unsigned GetAlignedArgumentStackSize(unsigned StackSize, + SelectionDAG &DAG) const; + + std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool isSigned, + bool isReplace) const; + + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; + SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, + int64_t Offset, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG 
&DAG) const; + SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToBT(SDValue And, ISD::CondCode CC, + SDLoc dl, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc dl, SelectionDAG &DAG) const override; + + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + + bool mayBeEmittedAsTailCall(CallInst *CI) const override; + + EVT getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const override; + + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + + LoadInst * + lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; + + bool needsCmpXchgNb(Type *MemType) const; + + // Utility function to emit the low-level va_arg code for X86-64. + MachineBasicBlock *EmitVAARG64WithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const; + + /// Utility function to emit the xmm reg save portion of va_start. 
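The atomic-expansion hooks a few declarations up (shouldExpandAtomicRMWInIR, needsCmpXchgNb) decide whether an atomicrmw maps onto a single locked instruction or must be rewritten as a compare-exchange loop, for example for nand, which x86 has no instruction for, or for widths that go through cmpxchg8b/cmpxchg16b. A sketch of the loop shape in plain C++ follows before the custom-inserter helpers continue; it illustrates the semantics only, not the IR-level rewrite:

#include <atomic>
#include <cstdint>

// Compare-exchange loop equivalent to expanding "atomicrmw nand": retry
// until ~(Old & V) is installed atomically, returning the previous value.
static uint64_t atomic_fetch_nand(std::atomic<uint64_t> &A, uint64_t V) {
  uint64_t Old = A.load(std::memory_order_relaxed);
  while (!A.compare_exchange_weak(Old, ~(Old & V),
                                  std::memory_order_seq_cst,
                                  std::memory_order_relaxed)) {
    // On failure, Old has been reloaded with the current value; retry.
  }
  return Old;
}

int main() {
  std::atomic<uint64_t> A{0xFFFF};
  return atomic_fetch_nand(A, 0x00FF) == 0xFFFF ? 0 : 1;
}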
+ MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr *BInstr, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredSelect(MachineInstr *I, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredCatchRet(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredCatchPad(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + MachineBasicBlock *emitFMA3Instr(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + /// Emit nodes that will be selected as "test Op0,Op0", or something + /// equivalent, for use with the given x86 condition code. + SDValue EmitTest(SDValue Op0, unsigned X86CC, SDLoc dl, + SelectionDAG &DAG) const; + + /// Emit nodes that will be selected as "cmp Op0,Op1", or something + /// equivalent, for use with the given x86 condition code. + SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, + SelectionDAG &DAG) const; + + /// Convert a comparison if required by the subtarget. + SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + + /// Use rsqrt* to speed up sqrt calculations. + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + + /// Use rcp* to speed up fdiv calculations. + SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; + + /// Reassociate floating point divisions into multiply by reciprocal. + unsigned combineRepeatedFPDivisors() const override; + }; + + namespace X86 { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); + } +} + +#endif // X86ISELLOWERING_H diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td new file mode 100644 index 0000000..ba1aede --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td @@ -0,0 +1,103 @@ +//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the 3DNow! instruction set, which extends MMX to support +// floating point and also adds a few more random instructions for good measure. +// +//===----------------------------------------------------------------------===// + +class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat> + : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> { +} + +class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. 
+ let isAsmParserOnly = 1; + let Constraints = "$src1 = $dst"; +} + +class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. + let isAsmParserOnly = 1; +} + +multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>; +} + +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; +} + +multiclass I3DNow_conv_rm<bits<8> opc, string Mn> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>; +} + +multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) + (bitconvert (load_mmx addr:$src))))]>; +} + +defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">; +defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">; +defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">; +defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">; +defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">; +defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">; +defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">; +defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">; +defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">; +defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">; +defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">; +defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">; +defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">; +defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">; +defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">; +defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">; +defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">; +defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">; +defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">; + + +def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", + [(int_x86_mmx_femms)]>; + +def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr), + "prefetch\t$addr", + [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>; + +def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr", + [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB, + Requires<[HasPrefetchW]>; + +// "3DNowA" instructions +defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; +defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">; +defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">; diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td new file mode 100644 index 0000000..0a27c33 --- /dev/null +++ 
b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -0,0 +1,7519 @@ +//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 AVX512 instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +// Group template arguments that can be derived from the vector type (EltNum x +// EltVT). These are things like the register class for the writemask, etc. +// The idea is to pass one of these as the template argument rather than the +// individual arguments. +// The template is also used for scalar types, in this case numelts is 1. +class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, + string suffix = ""> { + RegisterClass RC = rc; + ValueType EltVT = eltvt; + int NumElts = numelts; + + // Corresponding mask register class. + RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts); + + // Corresponding write-mask register class. + RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM"); + + // The GPR register class that can hold the write mask. Use GR8 for fewer + // than 8 elements. Use shift-right and equal to work around the lack of + // !lt in tablegen. + RegisterClass MRC = + !cast<RegisterClass>("GR" # + !if (!eq (!srl(NumElts, 3), 0), 8, NumElts)); + + // Suffix used in the instruction mnemonic. + string Suffix = suffix; + + // VTName is a string name for vector VT. For vector types it will be + // v # NumElts # EltVT, so for vector of 8 elements of i32 it will be v8i32 + // It is a little bit complex for scalar types, where NumElts = 1. + // In this case we build v4f32 or v2f64 + string VTName = "v" # !if (!eq (NumElts, 1), + !if (!eq (EltVT.Size, 32), 4, + !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT; + + // The vector VT. + ValueType VT = !cast<ValueType>(VTName); + + string EltTypeName = !cast<string>(EltVT); + // Size of the element type in bits, e.g. 32 for v16i32. + string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName)); + int EltSize = EltVT.Size; + + // "i" for integer types and "f" for floating-point types + string TypeVariantName = !subst(EltSizeName, "", EltTypeName); + + // Size of RC in bits, e.g. 512 for VR512. + int Size = VT.Size; + + // The corresponding memory operand, e.g. i512mem for VR512. + X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem"); + X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem"); + + // Load patterns + // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64 + // due to load promotion during legalization + PatFrag LdFrag = !cast<PatFrag>("load" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + VTName)), VTName)); + + PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + !if (!eq (Size, 512), + !if (!eq (EltSize, 64), "v8i64", "v16i32"), + VTName))), VTName)); + + PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); + + // The corresponding float type, e.g. 
v16f32 for v16i32 + // Note: For EltSize < 32, FloatVT is illegal and TableGen + // fails to compile, so we choose FloatVT = VT + ValueType FloatVT = !cast<ValueType>( + !if (!eq (!srl(EltSize,5),0), + VTName, + !if (!eq(TypeVariantName, "i"), + "v" # NumElts # "f" # EltSize, + VTName))); + + // The string to specify embedded broadcast in assembly. + string BroadcastStr = "{1to" # NumElts # "}"; + + // 8-bit compressed displacement tuple/subvector format. This is only + // defined for NumElts <= 8. + CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0), + !cast<CD8VForm>("CD8VT" # NumElts), ?); + + SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm, + !if (!eq (Size, 256), sub_ymm, ?)); + + Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, + !if (!eq (EltTypeName, "f64"), SSEPackedDouble, + SSEPackedInt)); + + RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); + + // A vector type of the same width with element type i32. This is used to + // create the canonical constant zero node ImmAllZerosV. + ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); + dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); + + string ZSuffix = !if (!eq (Size, 128), "Z128", + !if (!eq (Size, 256), "Z256", "Z")); +} + +def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; +def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; +def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; +def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; +def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; +def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; + +// "x" in v32i8x_info means RC = VR256X +def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">; +def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; +def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; +def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; +def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; +def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; + +def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">; +def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; +def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; +def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; +def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; +def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; + +// We map scalar types to the smallest (128-bit) vector type +// with the appropriate element type. This allows to use the same masking logic. +def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">; +def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">; +def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; +def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; + +class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256, + X86VectorVTInfo i128> { + X86VectorVTInfo info512 = i512; + X86VectorVTInfo info256 = i256; + X86VectorVTInfo info128 = i128; +} + +def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info, + v16i8x_info>; +def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info, + v8i16x_info>; +def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info, + v4i32x_info>; +def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info, + v2i64x_info>; +def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info, + v4f32x_info>; +def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info, + v2f64x_info>; + +// This multiclass generates the masking variants from the non-masking +// variant. 
It only provides the assembly pieces for the masking variants. +// It assumes custom ISel patterns for masking which can be provided as +// template arguments. +multiclass AVX512_maskable_custom<bits<8> O, Format F, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern, + list<dag> MaskingPattern, + list<dag> ZeroMaskingPattern, + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def NAME: AVX512<O, F, Outs, Ins, + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst , "#IntelSrcAsm#"}", + Pattern, itin>; + + // Prefer over VMOV*rrk Pat<> + let AddedComplexity = 20 in + def NAME#k: AVX512<O, F, Outs, MaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", + MaskingPattern, itin>, + EVEX_K { + // In case of the 3src subclass this is overridden with a let. + string Constraints = MaskingConstraint; + } + let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, "#IntelSrcAsm#"}", + ZeroMaskingPattern, + itin>, + EVEX_KZ; +} + + +// Common base class of AVX512_maskable and AVX512_maskable_3src. +multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + SDNode Select = vselect, + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, MaskingRHS)], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], + MaskingConstraint, NoItinerary, IsCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the vector instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. +multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common<O, F, _, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect, + "$src0 = $dst", itin, IsCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the scalar instruction. +multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common<O, F, _, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select, + "$src0 = $dst", itin, IsCommutable>; + +// Similar to AVX512_maskable but in this case one of the source operands +// ($src1) is already tied to $dst so we just use that for the preserved +// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude +// $src1. 
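Before the tied three-source forms that follow, it helps to spell out what the generated variants mean at run time: the bare NAME form is unmasked, the k form (EVEX_K) merge-masks, keeping the old destination element wherever the writemask bit is clear, and the kz form (EVEX_KZ) zero-masks, writing zero there instead; in the 3src forms below, $src1 is the value that plays the pass-through role. A plain C++ sketch of those semantics (illustrative names, not TableGen output):

#include <array>
#include <cstdint>

using V16F = std::array<float, 16>;

// Element-wise writemask select: bit I of K chooses Then[I] or Else[I].
static V16F maskSelect(uint16_t K, const V16F &Then, const V16F &Else) {
  V16F R{};
  for (int I = 0; I < 16; ++I)
    R[I] = ((K >> I) & 1) ? Then[I] : Else[I];
  return R;
}

// For an element-wise result Res of some vector op:
//   unmasked      : Dst = Res
//   merge-masking : Dst = maskSelect(K, Res, DstOld)    (EVEX_K)
//   zero-masking  : Dst = maskSelect(K, Res, all-zeros) (EVEX_KZ)
int main() {
  V16F Res{}, Old{};
  Res.fill(1.0f);
  Old.fill(7.0f);
  V16F Merged = maskSelect(0x00FF, Res, Old);    // high 8 lanes keep 7.0
  V16F Zeroed = maskSelect(0x00FF, Res, V16F{}); // high 8 lanes become 0.0
  return (Merged[15] == 7.0f && Zeroed[15] == 0.0f) ? 0 : 1;
}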
+multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, _, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>; + +// Similar to AVX512_maskable_3rc but in this case the input VT for the tied +// operand differs from the output VT. This requires a bitconvert on +// the preserved vector going into the vselect. +multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT, + X86VectorVTInfo InVT, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, OutVT, Outs, + !con((ins InVT.RC:$src1), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect InVT.KRCWM:$mask, RHS, + (bitconvert InVT.RC:$src1))>; + +multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, _, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (X86select _.KRCWM:$mask, RHS, _.RC:$src1)>; + +multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern> : + AVX512_maskable_custom<O, F, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], + "$src0 = $dst">; + + +// Instruction with mask that puts result in mask register, +// like "compare" and "vptest" +multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, + dag Outs, + dag Ins, dag MaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern, + list<dag> MaskingPattern, + string Round = "", + InstrItinClass itin = NoItinerary> { + def NAME: AVX512<O, F, Outs, Ins, + OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# + "$dst "#Round#", "#IntelSrcAsm#"}", + Pattern, itin>; + + def NAME#k: AVX512<O, F, Outs, MaskingIns, + OpcodeStr#"\t{"#Round#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#Round#"}", + MaskingPattern, itin>, EVEX_K; +} + +multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + string Round = "", + InstrItinClass itin = NoItinerary> : + AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.KRC:$dst, RHS)], + [(set _.KRC:$dst, MaskingRHS)], + Round, NoItinerary>; + +multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary> : + AVX512_maskable_common_cmp<O, F, _, Outs, Ins, + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (and _.KRCWM:$mask, RHS), + Round, itin>; + +multiclass 
AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm> : + AVX512_maskable_custom_cmp<O, F, Outs, + Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [],[],"", NoItinerary>; + +// Bitcasts between 512-bit vector types. Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasAVX512] in { + def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>; + + def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>; + 
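These bitconvert patterns, both the 512-bit block and the 128-bit block that continues below, all state the same fact: a bitcast between equally sized vector types only reinterprets the bits already sitting in the register, so no instruction is selected. A short C++ model of that reinterpretation (hypothetical type aliases):

#include <array>
#include <cstdint>
#include <cstring>

using V16I32 = std::array<int32_t, 16>; // one 512-bit register as 16 x i32
using V8F64  = std::array<double, 8>;   // the same 512 bits as 8 x f64

// A bitcast is a byte-for-byte reinterpretation; nothing moves between
// registers, so codegen emits no instruction for it.
static V8F64 bitconvert(const V16I32 &Src) {
  V8F64 Dst;
  static_assert(sizeof(Dst) == sizeof(Src), "bitcast requires equal sizes");
  std::memcpy(&Dst, &Src, sizeof(Dst));
  return Dst;
}

int main() {
  V16I32 Src{};
  return bitconvert(Src)[0] == 0.0 ? 0 : 1;
}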
def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>; + +// Bitcasts between 256-bit vector types. Return the original type since +// no instruction is needed for the conversion + def : Pat<(v4f64 (bitconvert (v8f32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v8i32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v4i64 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v32i8 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v8i32 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4i64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4f64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v32i8 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8f32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8i32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v4f64 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v32i8 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4f64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4i64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8f32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8i32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v32i8 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v8f32 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4i64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4f64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : 
Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>; +} + +// +// AVX-512: VPXOR instruction writes zero to its upper part, it's safe build zeros. +// + +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, Predicates = [HasAVX512] in { +def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "", + [(set VR512:$dst, (v16f32 immAllZerosV))]>; +} + +let Predicates = [HasAVX512] in { +def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - VECTOR INSERT +// +multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vinsert_insert> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V; + + let mayLoad = 1 in + defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<From.EltSize, From.CD8TupleForm>; + } +} + +multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> { + let Predicates = p in { + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rm") + To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + } +} + +multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + vinsert128_insert>, EVEX_V256; + + defm NAME # "32x4Z" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert128_insert>, EVEX_V512; + + defm NAME # "64x4Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert256_insert>, VEX_W, EVEX_V512; + + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vinsert128_insert>, VEX_W, EVEX_V256; + + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : 
vinsert_for_size<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert>, VEX_W, EVEX_V512; + + defm NAME # "32x8Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert>, EVEX_V512; + } +} + +defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>; +defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; + +// Codegen pattern with the alternative types, +// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. +defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; + +defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; + +defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; + +// Codegen pattern with the alternative types insert VEC128 into VEC256 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +// Codegen pattern with the alternative types insert VEC128 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types insert VEC256 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; + +// vinsertps - insert f32 to XMM +def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), + "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + EVEX_4V; +def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), + "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insertps VR128X:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +//===----------------------------------------------------------------------===// +// AVX-512 VECTOR EXTRACT +//--- + +multiclass vextract_for_size_first_position_lowering<X86VectorVTInfo From, + X86VectorVTInfo To> { + // A subvector extract from the first vector position is + // a subregister copy that needs no instruction. 
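Concretely: XMM0 aliases the low 128 bits of ZMM0 (and YMM0), so extracting the subvector at index 0 is only a register-class change handled by EXTRACT_SUBREG, whereas a nonzero index needs a real VEXTRACT* instruction to move the upper lanes down. A small sketch of that distinction (illustrative types, the register value modeled as an array):

#include <array>
#include <cstring>

using V512 = std::array<float, 16>; // a ZMM-sized value
using V128 = std::array<float, 4>;  // an XMM-sized value

// Lane 0 is simply the low bytes of the wide value, so "extracting" it needs
// no data movement; other lanes would need a shuffle/extract instruction.
static V128 extract128(const V512 &V, unsigned Lane) {
  V128 R;
  std::memcpy(R.data(), V.data() + 4 * Lane, sizeof(R));
  return R;
}

int main() {
  V512 Z{};
  Z[0] = 1.0f;
  Z[4] = 2.0f;
  return (extract128(Z, 0)[0] == 1.0f && extract128(Z, 1)[0] == 2.0f) ? 0 : 1;
}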
+ def NAME # To.NumElts: + Pat<(To.VT (extract_subvector (From.VT From.RC:$src),(iPTR 0))), + (To.VT (EXTRACT_SUBREG (From.VT From.RC:$src), To.SubRegIdx))>; +} + +multiclass vextract_for_size<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vextract_extract> : + vextract_for_size_first_position_lowering<From, To> { + + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + // use AVX512_maskable_in_asm (AVX512_maskable can't be used due to + // vextract_extract), we interesting only in patterns without mask, + // intrinsics pattern match generated bellow. + defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), + (ins From.RC:$src1, i32u8imm:$idx), + "vextract" # To.EltTypeName # "x" # To.NumElts, + "$idx, $src1", "$src1, $idx", + [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1), + (iPTR imm)))]>, + AVX512AIi8Base, EVEX; + let mayStore = 1 in { + def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX; + + def rmk : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, To.KRCWM:$mask, + From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst {${mask}}|" + "$dst {${mask}}, $src1, $src2}", + []>, EVEX_K, EVEX; + }//mayStore = 1 + } + + // Intrinsic call with masking. + def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrk") + To.RC:$src0, + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; + + // Intrinsic call with zero-masking. + def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrkz") + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; + + // Intrinsic call without masking. 
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rr") + From.RC:$src1, imm:$idx)>; +} + +// Codegen pattern for the alternative types +multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> : + vextract_for_size_first_position_lowering<From, To> { + + let Predicates = p in + def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; +} + +multiclass vextract_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + defm NAME # "32x4Z" : vextract_for_size<Opcode128, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; + defm NAME # "64x4Z" : vextract_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vextract256_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract>, + VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vextract_for_size<Opcode128, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; + defm NAME # "32x8Z" : vextract_for_size<Opcode256, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + vextract256_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; + } +} + +defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>; +defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>; + +// extract_subvector codegen patterns with the alternative types. +// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. 
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + +// Codegen pattern with the alternative types extract VEC128 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types extract VEC256 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; + +// A 128-bit subvector insert to the first 512-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; + +def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; + +// vextractps - extract 32 bits from XMM +def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), + (ins VR128X:$src1, u8imm:$src2), + "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (extractelt 
(bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, + EVEX; + +def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), + (ins f32mem:$dst, VR128X:$src1, u8imm:$src2), + "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), + addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>; + +//===---------------------------------------------------------------------===// +// AVX-512 BROADCAST +//--- + +multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { + + defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>, + T8PD, EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src)))>, + T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>; +} + +multiclass avx512_fp_broadcast_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; + } +} + +let ExeDomain = SSEPackedSingle in { + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, "vbroadcastss", + avx512vl_f32_info>; + let Predicates = [HasVLX] in { + defm VBROADCASTSSZ128 : avx512_broadcast_rm<0x18, "vbroadcastss", + v4f32x_info, v4f32x_info>, EVEX_V128; + } +} + +let ExeDomain = SSEPackedDouble in { + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, "vbroadcastsd", + avx512vl_f64_info>, VEX_W; +} + +// avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. +// Later, we can canonize broadcast instructions before ISel phase and +// eliminate additional patterns on ISel. 
+// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar +// representations of source +multiclass avx512_broadcast_pat<string InstName, SDNode OpNode, + X86VectorVTInfo _, RegisterClass SrcRC_v, + RegisterClass SrcRC_s> { + def : Pat<(_.VT (OpNode (_.EltVT SrcRC_s:$src))), + (!cast<Instruction>(InstName##"r") + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + let AddedComplexity = 30 in { + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.RC:$src0)), + (!cast<Instruction>(InstName##"rk") _.RC:$src0, _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + def : Pat<(_.VT(vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.ImmAllZerosV)), + (!cast<Instruction>(InstName##"rkz") _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + } +} + +defm : avx512_broadcast_pat<"VBROADCASTSSZ", X86VBroadcast, v16f32_info, + VR128X, FR32X>; +defm : avx512_broadcast_pat<"VBROADCASTSDZ", X86VBroadcast, v8f64_info, + VR128X, FR64X>; + +let Predicates = [HasVLX] in { + defm : avx512_broadcast_pat<"VBROADCASTSSZ256", X86VBroadcast, + v8f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSSZ128", X86VBroadcast, + v4f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSDZ256", X86VBroadcast, + v4f64x_info, VR128X, FR64X>; +} + +def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSZm addr:$src)>; +def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSDZm addr:$src)>; + +def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src), + (VBROADCASTSSZm addr:$src)>; +def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), + (VBROADCASTSDZm addr:$src)>; + +multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _, + RegisterClass SrcRC> { + defm r : AVX512_maskable_in_asm<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins SrcRC:$src), "vpbroadcast"##_.Suffix, + "$src", "$src", []>, T8PD, EVEX; +} + +multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _, + RegisterClass SrcRC, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256; + defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128; + } +} + +defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR32, + HasBWI>; +defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32, + HasBWI>; +defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32, + HasAVX512>; +defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64, + HasAVX512>, VEX_W; + +def : Pat <(v16i32 (X86vzext VK16WM:$mask)), + (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>; + +def : Pat <(v8i64 (X86vzext VK8WM:$mask)), + (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>; + +def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), + (VPBROADCASTDrZr GR32:$src)>; +def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))), + (VPBROADCASTQrZr GR64:$src)>; + +def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))), + (VPBROADCASTDrZr GR32:$src)>; +def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))), + (VPBROADCASTQrZr GR64:$src)>; + +def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src), + (v16i32 immAllZerosV), (i16 GR16:$mask))), + (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; +def : Pat<(v8i64 
(int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), + (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), + (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; + +// Provide aliases for broadcast from the same register class that +// automatically does the extract. +multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> { + def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), + (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r") + (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>; +} + +multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + avx512_int_broadcast_rm_lowering<_.info512, _.info256>, + EVEX_V512; + // Defined separately to avoid redefinition. + defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>; + } + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + avx512_int_broadcast_rm_lowering<_.info256, _.info256>, + EVEX_V256; + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>, + EVEX_V128; + } +} + +defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", + avx512vl_i8_info, HasBWI>; +defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", + avx512vl_i16_info, HasBWI>; +defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", + avx512vl_i32_info, HasAVX512>; +defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", + avx512vl_i64_info, HasAVX512>, VEX_W; + +multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, + AVX5128IBase, EVEX; +} + +defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + v16i32_info, v4i32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", + v16f32_info, v4f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", + v8i64_info, v4i64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; +defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", + v8f64_info, v4f64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; + +let Predicates = [HasVLX] in { +defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + v8i32x_info, v4i32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", + v8f32x_info, v4f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; +} +let Predicates = [HasVLX, HasDQI] in { +defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", + v4i64x_info, v2i64x_info>, VEX_W, + EVEX_V256, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", + v4f64x_info, v2f64x_info>, VEX_W, + EVEX_V256, EVEX_CD8<64, CD8VT2>; +} +let Predicates = [HasDQI] in { +defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", + v8i64_info, v2i64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8", + v16i32_info, 
v8i32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; +defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", + v8f64_info, v2f64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8", + v16f32_info, v8f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; +} + +multiclass avx512_broadcast_32x2<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src, + SDNode OpNode = X86SubVBroadcast> { + + defm r : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src)))>, + T8PD, EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode + (_Src.VT (scalar_to_vector(loadi64 addr:$src)))))>, + T8PD, EVEX, EVEX_CD8<_Src.EltSize, CD8VT2>; +} + +multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + let Predicates = [HasDQI] in + defm Z : avx512_broadcast_32x2<opc, OpcodeStr, _.info512, _.info128>, + EVEX_V512; + let Predicates = [HasDQI, HasVLX] in + defm Z256 : avx512_broadcast_32x2<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; +} + +multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> : + avx512_common_broadcast_32x2<opc, OpcodeStr, _> { + + let Predicates = [HasDQI, HasVLX] in + defm Z128 : avx512_broadcast_32x2<opc, OpcodeStr, _.info128, _.info128, + X86SubV32x2Broadcast>, EVEX_V128; +} + +defm VPBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", + avx512vl_i32_info>; +defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", + avx512vl_f32_info>; + +def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), + (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))), + (VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>; + +def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), + (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; +def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), + (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. 
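+// In that case the scalar value is still in an FR32X/FR64X register, so it is
+// first copied into a VR128X register and the register form of the broadcast
+// (VBROADCASTSSZr/VBROADCASTSDZr) is used instead of the memory form.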
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), + (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; +def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), + (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + + +//===----------------------------------------------------------------------===// +// AVX-512 BROADCAST MASK TO VECTOR REGISTER +//--- +multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass KRC> { + def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX; +} + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> { + let Predicates = [HasCDI] in + defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512; + let Predicates = [HasCDI, HasVLX] in { + defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256; + defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128; + } +} + +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", + avx512vl_i32_info, VK16>; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", + avx512vl_i64_info, VK8>, VEX_W; + +//===----------------------------------------------------------------------===// +// -- VPERMI2 - 3 source operands form -- +multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst" in { + defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + AVX5128IBase; + + let mayLoad = 1 in + defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, + (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + EVEX_4V, AVX5128IBase; + } +} +multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { + let mayLoad = 1, Constraints = "$src1 = $dst" in + defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermi2X IdxVT.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + AVX5128IBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_i_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { + let Predicates = [HasBWI] in + defm NAME: avx512_perm_i<opc, 
OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2W : avx512_perm_i_sizes_w<0x75, "vpermi2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +// VPERMT2 +multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst" in { + defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins IdxVT.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V, + AVX5128IBase; + + let mayLoad = 1 in + defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins IdxVT.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, + (bitconvert (_.LdFrag addr:$src3))))>, + EVEX_4V, AVX5128IBase; + } +} +multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { + let mayLoad = 1, Constraints = "$src1 = $dst" in + defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermt2 _.RC:$src1, + IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + AVX5128IBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_t_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { + let Predicates = [HasBWI] in + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2W : 
avx512_perm_t_sizes_w<0x7D, "vpermt2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 - BLEND using mask +// +multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + []>, EVEX_4V; + def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT _.RC:$src2)))]>, EVEX_4V, EVEX_K; + def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ; + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, + EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; + def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; + } + } +} +multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + + def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.RC:$dst,(X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>, + EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + + def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + +} + +multiclass blendmask_dq <bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + } +} + +multiclass blendmask_bw <bits<8> opc, string OpcodeStr, + 
AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasBWI] in + defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasBWI, HasVLX] in { + defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + } +} + + +defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; + + +let Predicates = [HasAVX512] in { +def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), + (v8f32 VR256X:$src2))), + (EXTRACT_SUBREG + (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), + (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG + (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; +} +//===----------------------------------------------------------------------===// +// Compare Instructions +//===----------------------------------------------------------------------===// + +// avx512_cmp_scalar - AVX512 CMPSS and CMPSD + +multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{ + + defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>, EVEX_4V; + let mayLoad = 1 in + defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2,{sae}", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B; + // Accept explicit immediate argument form instead of comparison code. 
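+// For example, the assembler accepts "vcmpss $0, %xmm2, %xmm1, %k1" as an
+// equivalent spelling of "vcmpeqss %xmm2, %xmm1, %k1" (immediate 0 encodes
+// the EQ predicate).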
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs VK1:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V; + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc,{sae}, $src2, $src1","$src1, $src2,{sae}, $cc">, + EVEX_4V, EVEX_B; + }// let isAsmParserOnly = 1, hasSideEffects = 0 + + let isCodeGenOnly = 1 in { + def rr : AVX512Ii8<0xC2, MRMSrcReg, + (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + _.FRC:$src2, + imm:$cc))], + IIC_SSE_ALU_F32S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512Ii8<0xC2, MRMSrcMem, + (outs _.KRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + } +} + +let Predicates = [HasAVX512] in { + defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>, + AVX512XSIi8Base; + defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>, + AVX512XDIi8Base, VEX_W; +} + +multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + def rr : AVX512BI<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrk : AVX512BI<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmk : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; +} + +multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_packed<opc, OpcodeStr, OpNode, _> { + let mayLoad = 1 in { + def rmb : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", + "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2))))], + 
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmbk : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } +} + +multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; + +def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (VPCMPGTDZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; + +def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (VPCMPEQDZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; + +multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> { + def rri : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in + def rmi : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set 
_.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrik : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmik : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rri_alt : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in + def rmi_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrik_alt : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmik_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + } +} + +multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_cc<opc, Suffix, OpNode, _> { + def rmib : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmibk : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + + // Accept explicit immediate argument form instead of comparison code. 
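+// For example, "vpcmpd $2, (%rax){1to16}, %zmm1, %k2" is accepted as an
+// equivalent spelling of "vpcmpled (%rax){1to16}, %zmm1, %k2" (immediate 2
+// encodes the LE predicate).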
+ let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in { + def rmib_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmibk_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } +} + +multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128; + } +} + +multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; +defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; + +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; + +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; + +multiclass avx512_vcmp_common<X86VectorVTInfo _> { + + defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>; + + let mayLoad = 1 in { + defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (X86cmpm (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)>; + + defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (X86cmpm (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + imm:$cc)>,EVEX_B; + } + // Accept explicit immediate argument form instead of comparison code. 
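+// For example, "vcmpps $1, %zmm2, %zmm1, %k1" is accepted as an equivalent
+// spelling of "vcmpltps %zmm2, %zmm1, %k1" (immediate 1 encodes the LT_OS
+// predicate).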
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">; + + let mayLoad = 1 in { + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">; + + defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $cc">,EVEX_B; + } + } +} + +multiclass avx512_vcmp_sae<X86VectorVTInfo _> { + // comparison code form (VCMP[EQ/LT/LE/...] + defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2,{sae}", + (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, EVEX_B; + + let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc,{sae}, $src2, $src1", + "$src1, $src2,{sae}, $cc">, EVEX_B; + } +} + +multiclass avx512_vcmp<AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcmp_common<_.info512>, + avx512_vcmp_sae<_.info512>, EVEX_V512; + + } + let Predicates = [HasAVX512,HasVLX] in { + defm Z128 : avx512_vcmp_common<_.info128>, EVEX_V128; + defm Z256 : avx512_vcmp_common<_.info256>, EVEX_V256; + } +} + +defm VCMPPD : avx512_vcmp<avx512vl_f64_info>, + AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VCMPPS : avx512_vcmp<avx512vl_f32_info>, + AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + +def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (VCMPPSZrri + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; +def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (VPCMPDZrri + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; +def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (VPCMPUDZrri + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; + +// ---------------------------------------------------------------- +// FPClass +//handle fpclass instruction mask = op(reg_scalar,imm) +// op(mem_scalar,imm) +multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let 
mayLoad = 1, AddedComplexity = 20 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + } + } +} + +//handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm) +// fpclass(reg_vec, mem_vec, imm) +// fpclass(reg_vec, broadcast(eltVt), imm) +multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, string mem, string broadcast>{ + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}| $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst | $dst, ${src1}" + ##_.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>,EVEX_B; + def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst {${mask}} | $dst {${mask}}, ${src1}"## + _.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, + EVEX_B, EVEX_K; + } +} + +multiclass avx512_vector_fpclass_all<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd, + string broadcast>{ + let Predicates = [prd] in { + defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}", + broadcast>, EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}", + broadcast>, EVEX_V128; + defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}", + broadcast>, EVEX_V256; + } +} + +multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec, + bits<8> opcScalar, SDNode 
VecOpNode, SDNode ScalarOpNode, Predicate prd>{ + defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec, + VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>; + defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec, + VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W; + defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f32x_info, prd>, EVEX_CD8<32, CD8VT1>; + defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W; +} + +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, + X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX; + +//----------------------------------------------------------------- +// Mask register copy, including +// - copy between mask registers +// - load/store mask registers +// - copy from GPR to mask register and vice versa +// +multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, + string OpcodeStr, RegisterClass KRC, + ValueType vvt, X86MemOperand x86memop> { + let hasSideEffects = 0 in { + def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + let mayLoad = 1 in + def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (vvt (load addr:$src)))]>; + let mayStore = 1 in + def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store KRC:$src, addr:$dst)]>; + } +} + +multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, + string OpcodeStr, + RegisterClass KRC, RegisterClass GRC> { + let hasSideEffects = 0 in { + def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + } +} + +let Predicates = [HasDQI] in + defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>, + VEX, PD; + +let Predicates = [HasAVX512] in + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, + VEX, PS; + +let Predicates = [HasBWI] in { + defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>, + VEX, PD, VEX_W; + defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>, + VEX, XD; +} + +let Predicates = [HasBWI] in { + defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>, + VEX, PS, VEX_W; + defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>, + VEX, XD, VEX_W; +} + +// GR from/to mask register +let Predicates = [HasDQI] in { + def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (KMOVBkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit))>; + def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG (KMOVBrk VK8:$src), sub_8bit)>; +} +let Predicates = [HasAVX512] in { + def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), + (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>; + def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), + (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (KMOVDkr GR32:$src)>; + def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), (KMOVDrk VK32:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v64i1 (bitconvert (i64 
GR64:$src))), (KMOVQkr GR64:$src)>; + def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), (KMOVQrk VK64:$src)>; +} + +// Load/store kreg +let Predicates = [HasDQI] in { + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), + (KMOVBmk addr:$dst, VK8:$src)>; + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), + (KMOVBkm addr:$src)>; + + def : Pat<(store VK4:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(store VK2:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>; +} +let Predicates = [HasAVX512, NoDQI] in { + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), + (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>; + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), + (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>; +} +let Predicates = [HasAVX512] in { + def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), + (KMOVWmk addr:$dst, VK16:$src)>; + def : Pat<(i1 (load addr:$src)), + (COPY_TO_REGCLASS (AND16ri (i16 (SUBREG_TO_REG (i32 0), + (MOV8rm addr:$src), sub_8bit)), + (i16 1)), VK1)>; + def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))), + (KMOVWkm addr:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst), + (KMOVDmk addr:$dst, VK32:$src)>; + def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))), + (KMOVDkm addr:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst), + (KMOVQmk addr:$dst, VK64:$src)>; + def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))), + (KMOVQkm addr:$src)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(i1 (trunc (i64 GR64:$src))), + (COPY_TO_REGCLASS (KMOVWkr (AND32ri (EXTRACT_SUBREG $src, sub_32bit), + (i32 1))), VK1)>; + + def : Pat<(i1 (trunc (i32 GR32:$src))), + (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>; + + def : Pat<(i1 (trunc (i8 GR8:$src))), + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))), + VK1)>; + def : Pat<(i1 (trunc (i16 GR16:$src))), + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))), + VK1)>; + + def : Pat<(i32 (zext VK1:$src)), + (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; + def : Pat<(i32 (anyext VK1:$src)), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>; + + def : Pat<(i8 (zext VK1:$src)), + (EXTRACT_SUBREG + (AND32ri (KMOVWrk + (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; + def : Pat<(i8 (anyext VK1:$src)), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>; + + def : Pat<(i64 (zext VK1:$src)), + (AND64ri8 (SUBREG_TO_REG (i64 0), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; + def : Pat<(i16 (zext VK1:$src)), + (EXTRACT_SUBREG + (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), + sub_16bit)>; +} +def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; +def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; +def : Pat<(v4i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK4)>; +def : Pat<(v2i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK2)>; +def : Pat<(v32i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK32)>; +def : Pat<(v64i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK64)>; + + +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
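+// Without DQI there is no KMOVB, so the i8 <-> v8i1 bitconverts below go
+// through KMOVW: the GR8 value is zero-extended to GR32 before KMOVWkr, and
+// the low byte of the KMOVWrk result is extracted on the way back.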
+let Predicates = [HasAVX512, NoDQI] in { + // GR from/to 8-bit mask without native support + def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (COPY_TO_REGCLASS + (KMOVWkr (MOVZX32rr8 GR8 :$src)), VK8)>; + def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK16:$src, VK1)>; + def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK8:$src, VK1)>; +} +let Predicates = [HasBWI] in { + def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK32:$src, VK1)>; + def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK64:$src, VK1)>; +} + +// Mask unary operation +// - KNOT +multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode, + Predicate prd> { + let Predicates = [prd] in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (OpNode KRC:$src))]>; +} + +multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + HasDQI>, VEX, PD; + defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + HasAVX512>, VEX, PS; + defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + HasBWI>, VEX, PD, VEX_W; + defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + HasBWI>, VEX, PS, VEX_W; +} + +defm KNOT : avx512_mask_unop_all<0x44, "knot", not>; + +multiclass avx512_mask_unop_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w") + (i16 GR16:$src)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr") + (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>; +} +defm : avx512_mask_unop_int<"knot", "KNOT">; + +let Predicates = [HasDQI] in +def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (KNOTBrr VK8:$src1)>; +let Predicates = [HasAVX512] in +def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>; +let Predicates = [HasBWI] in +def : Pat<(xor VK32:$src1, (v32i1 immAllOnesV)), (KNOTDrr VK32:$src1)>; +let Predicates = [HasBWI] in +def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>; + +// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit +let Predicates = [HasAVX512, NoDQI] in { +def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; +def : Pat<(not VK8:$src), + (COPY_TO_REGCLASS + (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; +} +def : Pat<(xor VK4:$src1, (v4i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src1, VK16)), VK4)>; +def : Pat<(xor VK2:$src1, (v2i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src1, VK16)), VK2)>; + +// Mask binary operation +// - KAND, KANDN, KOR, KXNOR, KXOR +multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode, + Predicate prd, bit IsCommutable> { + let Predicates = [prd], isCommutable = IsCommutable in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, + 
SDPatternOperator OpNode, bit IsCommutable, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; + defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + prdW, IsCommutable>, VEX_4V, VEX_L, PS; + defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; + defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS; +} + +def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; +def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; + +defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>; +defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; +defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>; +defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; +defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; + +multiclass avx512_mask_binop_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w") + (i16 GR16:$src1), (i16 GR16:$src2)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr") + (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), + (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; +} + +defm : avx512_mask_binop_int<"kand", "KAND">; +defm : avx512_mask_binop_int<"kandn", "KANDN">; +defm : avx512_mask_binop_int<"kor", "KOR">; +defm : avx512_mask_binop_int<"kxnor", "KXNOR">; +defm : avx512_mask_binop_int<"kxor", "KXOR">; + +multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> { + // With AVX512F, 8-bit mask is promoted to 16-bit mask, + // for the DQI set, this type is legal and KxxxB instruction is used + let Predicates = [NoDQI] in + def : Pat<(OpNode VK8:$src1, VK8:$src2), + (COPY_TO_REGCLASS + (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + + // All types smaller than 8 bits require conversion anyway + def : Pat<(OpNode VK1:$src1, VK1:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + def : Pat<(OpNode VK2:$src1, VK2:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK2:$src1, VK16), + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>; + def : Pat<(OpNode VK4:$src1, VK4:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK4:$src1, VK16), + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>; +} + +defm : avx512_binop_pat<and, KANDWrr>; +defm : avx512_binop_pat<andn, KANDNWrr>; +defm : avx512_binop_pat<or, KORWrr>; +defm : avx512_binop_pat<xnor, KXNORWrr>; +defm : avx512_binop_pat<xor, KXORWrr>; + +def : Pat<(xor (xor VK16:$src1, VK16:$src2), (v16i1 immAllOnesV)), + (KXNORWrr VK16:$src1, VK16:$src2)>; +def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), + (KXNORBrr VK8:$src1, VK8:$src2)>, Requires<[HasDQI]>; +def : Pat<(xor (xor VK32:$src1, VK32:$src2), (v32i1 immAllOnesV)), + (KXNORDrr VK32:$src1, VK32:$src2)>, Requires<[HasBWI]>; +def : Pat<(xor (xor VK64:$src1, VK64:$src2), (v64i1 immAllOnesV)), + (KXNORQrr VK64:$src1, VK64:$src2)>, Requires<[HasBWI]>; + +let Predicates = [NoDQI] in +def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + +def : Pat<(xor (xor VK4:$src1, VK4:$src2), 
(v4i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK4:$src1, VK16), + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>; + +def : Pat<(xor (xor VK2:$src1, VK2:$src2), (v2i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK2:$src1, VK16), + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>; + +def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + +// Mask unpacking +multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, + RegisterClass KRCSrc, Predicate prd> { + let Predicates = [prd] in { + let hasSideEffects = 0 in + def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), + (ins KRC:$src1, KRC:$src2), + "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V, VEX_L; + + def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), + (!cast<Instruction>(NAME##rr) + (COPY_TO_REGCLASS KRCSrc:$src2, KRC), + (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>; + } +} + +defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; + +// Mask bit testing +multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode, Predicate prd> { + let Predicates = [prd], Defs = [EFLAGS] in + def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>, + VEX, PD; + defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>, + VEX, PS; + defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>, + VEX, PS, VEX_W; + defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>, + VEX, PD, VEX_W; +} + +defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>; + +// Mask shift +multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode> { + let Predicates = [HasAVX512] in + def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm), + !strconcat(OpcodeStr, + "\t{$imm, $src, $dst|$dst, $src, $imm}"), + [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>; +} + +multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, + SDNode OpNode> { + defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>, + VEX, TAPD; + let Predicates = [HasBWI] in { + defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>, + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>, + VEX, TAPD; + } +} + +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>; + +// Mask setting all 0s or 1s +multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { + let Predicates = [HasAVX512] in + let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in + def #NAME# : I<0, Pseudo, (outs 
KRC:$dst), (ins), "", + [(set KRC:$dst, (VT Val))]>; +} + +multiclass avx512_mask_setop_w<PatFrag Val> { + defm B : avx512_mask_setop<VK8, v8i1, Val>; + defm W : avx512_mask_setop<VK16, v16i1, Val>; + defm D : avx512_mask_setop<VK32, v32i1, Val>; + defm Q : avx512_mask_setop<VK64, v64i1, Val>; +} + +defm KSET0 : avx512_mask_setop_w<immAllZerosV>; +defm KSET1 : avx512_mask_setop_w<immAllOnesV>; + +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. +let Predicates = [HasAVX512] in { + def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; + def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; + def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; + def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; + def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; + def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; + def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; +} +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>; + +def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>; + +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), + (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; + +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; + +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), + (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>; + +def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; + +def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), + (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>; + +def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + +def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; + +def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; + +def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; +def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + +def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK4:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK8:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK16:$src, VK32))>; + +def : Pat<(v64i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK2:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK4:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK8:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK16:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK32:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK32:$src, VK64))>; + + +def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), + (v8i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS 
VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), + (v8i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + +def : Pat<(v4i1 (X86vsrli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Aligned and unaligned load and store +// + + +multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag ld_frag, PatFrag mload, + bit IsReMaterializable = 1> { + let hasSideEffects = 0 in { + def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + _.ExeDomain>, EVEX; + def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), [], _.ExeDomain>, + EVEX, EVEX_KZ; + + let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable, + SchedRW = [WriteLoad] in + def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))], + _.ExeDomain>, EVEX; + + let Constraints = "$src0 = $dst" in { + def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT _.RC:$src1), + (_.VT _.RC:$src0))))], _.ExeDomain>, + EVEX, EVEX_K; + let mayLoad = 1, SchedRW = [WriteLoad] in + def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT + (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src1))), + (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K; + } + let mayLoad = 1, SchedRW = [WriteLoad] in + def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"# + "${dst} {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))], + _.ExeDomain>, EVEX, EVEX_KZ; + } + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))), + (!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0, + _.KRCWM:$mask, addr:$ptr)>; +} + +multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag, + masked_load_aligned512, IsReMaterializable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, 
_.info256.AlignedLdFrag, + masked_load_aligned256, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag, + masked_load_aligned128, IsReMaterializable>, EVEX_V128; + } +} + +multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V128; + } +} + +multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag st_frag, PatFrag mstore> { + + def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # ".s\t{$src, $dst|$dst, $src}", + [], _.ExeDomain>, EVEX; + def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"# + "${dst} {${mask}}, $src}", + [], _.ExeDomain>, EVEX, EVEX_K; + def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" # + "${dst} {${mask}} {z}, $src}", + [], _.ExeDomain>, EVEX, EVEX_KZ; + + let mayStore = 1 in { + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX; + def mrk : AVX512PI<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", + [], _.ExeDomain>, EVEX, EVEX_K; + } + + def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)), + (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr, + _.KRCWM:$mask, _.RC:$src)>; +} + + +multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, _.info512, store, + masked_store_unaligned>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store, + masked_store_unaligned>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store, + masked_store_unaligned>, EVEX_V128; + } +} + +multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512, + masked_store_aligned512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256, + masked_store_aligned256>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore, + masked_store_aligned128>, EVEX_V128; + } +} + +defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + +defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512>, + avx512_store_vl<0x11, 
"vmovups", avx512vl_f32_info, HasAVX512>, + PS, EVEX_CD8<32, CD8VF>; + +defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>, + avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), + (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), + (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), + (VMOVAPDZrm addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), + (VMOVAPSZrm addr:$ptr)>; + +def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src), + GR16:$mask), + (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), + GR8:$mask), + (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + +def: Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src), + GR16:$mask), + (VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), + GR8:$mask), + (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + +defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, PD, EVEX_CD8<32, CD8VF>; + +defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, + HasBWI>, XD, EVEX_CD8<8, CD8VF>; + +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, + HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, + HasAVX512>, XS, EVEX_CD8<32, CD8VF>; + +defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, + HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; + +def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, + (v16i32 immAllZerosV), GR16:$mask)), + (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr, + (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 
VR512:$src), + GR16:$mask), + (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src), + GR8:$mask), + (VMOVDQU64Zmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + +let AddedComplexity = 20 in { +def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src), + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrrkz VK8WM:$mask, VR512:$src)>; + +def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), + (v8i64 VR512:$src))), + (VMOVDQU64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), + VK8), VR512:$src)>; + +def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src), + (v16i32 immAllZerosV))), + (VMOVDQU32Zrrkz VK16WM:$mask, VR512:$src)>; + +def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), + (v16i32 VR512:$src))), + (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; +} + +// Move Int Doubleword to Packed Double Int +// +def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + EVEX; +def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; +def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, EVEX, VEX_W; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, + EVEX, VEX_W, EVEX_CD8<64, CD8VT1>; +let isCodeGenOnly = 1 in { +def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set FR64X:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; +def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64X:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; +def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64X:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, + EVEX_CD8<64, CD8VT1>; +} + +// Move Int Doubleword to Single Scalar +// +let isCodeGenOnly = 1 in { +def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, EVEX; + +def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; +} + +// Move doubleword from xmm register to r/m32 +// +def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, + EVEX; +def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(store (i32 
(extractelt (v4i32 VR128X:$src), + (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, EVEX_CD8<32, CD8VT1>; + +// Move quadword from xmm1 register to r/m64 +// +def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), + (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>, + Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; + +let hasSideEffects = 0 in +def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq.s\t{$src, $dst|$dst, $src}",[]>, + EVEX, VEX_W; + +// Move Scalar Single to Double Int +// +let isCodeGenOnly = 1 in { +def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), + (ins FR32X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32X:$src))], + IIC_SSE_MOVD_ToGP>, EVEX; +def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, FR32X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; +} + +// Move Quadword Int to Packed Quadword Int +// +def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; + +//===----------------------------------------------------------------------===// +// AVX-512 MOVSS, MOVSD +//===----------------------------------------------------------------------===// + +multiclass avx512_move_scalar <string asm, SDNode OpNode, + X86VectorVTInfo _> { + defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + asm, "$src2, $src1","$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2))), + IIC_SSE_MOV_S_RR>, EVEX_4V; + let Constraints = "$src1 = $dst" , mayLoad = 1 in + defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, + (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + asm,"$src","$src", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, EVEX; + let isCodeGenOnly = 1 in { + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.FRC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + } + let mayStore = 1 in { + def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, + EVEX; + def mrk: 
AVX512PI<0x11, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; + } // mayStore +} + +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, + VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; + +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, + VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), + (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; + +def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), + (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; + +def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), + (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), + (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + +defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovss.s", "$src2, $src1", "$src1, $src2", []>, + XS, EVEX_4V, VEX_LIG; + +defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, + XD, EVEX_4V, VEX_LIG, VEX_W; + +let Predicates = [HasAVX512] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128X then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))), + (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))), + (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))), + (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))), + (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (v4f32 (V_SET0)), + (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>; + } + + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; + } + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)), + FR32X:$src)), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)), + FR64X:$src)), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (v2f64 (V_SET0)), + (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)), + (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; + + // Extract and store. 
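+  // No separate extract instruction is needed here: the VMOVSS/VMOVSD
+  // store forms write only the low 4 or 8 bytes of the XMM register, so an
+  // extractelt of lane 0 feeding a store maps directly onto the scalar
+  // store patterns below.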
+ def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), + addr:$dst), + (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; + def : Pat<(store (f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))), + addr:$dst), + (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>; + + // Shuffle with VMOVSS + def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)), + (VMOVSSZrr (v4i32 VR128X:$src1), + (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>; + def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)), + (VMOVSSZrr (v4f32 VR128X:$src1), + (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>; + + // 256-bit variants + def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)), + sub_xmm)>; + + // Shuffle with VMOVSD + def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)), + sub_xmm)>; + + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; +} + +let AddedComplexity = 15 in +def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v2i64 (X86vzmovl + (v2i64 VR128X:$src))))], + IIC_SSE_MOVQ_RR>, EVEX, VEX_W; + +let AddedComplexity = 20 , isCodeGenOnly = 1 in +def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, + EVEX_CD8<8, CD8VT8>; + +let Predicates = [HasAVX512] in { + // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
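+  // For example, "vmovd %eax, %xmm0" writes zeros to XMM0[127:32] (and,
+  // with the VEX/EVEX encoding, to the upper bits of the enclosing
+  // YMM/ZMM register as well), so an X86vzmovl of a scalar_to_vector node
+  // can be selected directly to VMOVDI2PDIZrr/rm with no extra zeroing.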
+ let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIZrr GR64:$src)>; + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (VMOVDI2PDIZrr GR32:$src)>; + + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVZPQILo2PQIZrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))), + (VMOVZPQILo2PQIZrr VR128X:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), + (VMOVZPQILo2PQIZrm addr:$src)>; + } + + // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; +} + +def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; + +def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; + +def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; + +def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Non-temporals +//===----------------------------------------------------------------------===// +let SchedRW = [WriteLoad] in { + def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))], + SSEPackedInt>, EVEX, T8PD, EVEX_V512, + EVEX_CD8<64, CD8VF>; + + let Predicates = [HasAVX512, HasVLX] in { + def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst), + (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V256, + EVEX_CD8<64, CD8VF>; + + def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V128, + EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag, + ValueType OpVT, RegisterClass RC, X86MemOperand memop, + Domain d, InstrItinClass itin = IIC_SSE_MOVNT> { + let SchedRW = [WriteStore], mayStore = 1, + AddedComplexity = 400 in + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(st_frag (OpVT RC:$src), addr:$dst)], d, itin>, EVEX; +} + +multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag, + string elty, string elsz, string vsz512, + string vsz256, string vsz128, Domain d, + Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> { + let Predicates = [prd] in + defm Z : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz512##elty##elsz), VR512, + !cast<X86MemOperand>(elty##"512mem"), d, itin>, + 
EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz256##elty##elsz), VR256X, + !cast<X86MemOperand>(elty##"256mem"), d, itin>, + EVEX_V256; + + defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz128##elty##elsz), VR128X, + !cast<X86MemOperand>(elty##"128mem"), d, itin>, + EVEX_V128; + } +} + +defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore, + "i", "64", "8", "4", "2", SSEPackedInt, + HasAVX512>, PD, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore, + "f", "64", "8", "4", "2", SSEPackedDouble, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore, + "f", "32", "16", "8", "4", SSEPackedSingle, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Integer arithmetic +// +multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + itins.rr, IsCommutable>, + AVX512BIBase, EVEX_4V; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (bitconvert (_.LdFrag addr:$src2)))), + itins.rm>, + AVX512BIBase, EVEX_4V; +} + +multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> : + avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))), + itins.rm>, + AVX512BIBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + IsCommutable>, EVEX_V128; + } +} + +multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + IsCommutable>, EVEX_V128; + } +} + +multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info, + itins, 
prd, IsCommutable>, + VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info, + itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>; +} + +multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info, + itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>; +} + +multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info, + itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>; +} + +multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd, + IsCommutable>; + + defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd, + IsCommutable>; + + defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode, + OpndItins itins, bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, + itins, HasAVX512, IsCommutable>, + avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, + itins, HasBWI, IsCommutable>; +} + +multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, + SDNode OpNode,X86VectorVTInfo _Src, + X86VectorVTInfo _Dst, bit IsCommutable = 0> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2))), + itins.rr, IsCommutable>, + AVX512BIBase, EVEX_4V; + let mayLoad = 1 in { + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), + (bitconvert (_Src.LdFrag addr:$src2)))), + itins.rm>, + AVX512BIBase, EVEX_4V; + + defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), + OpcodeStr, + "${src2}"##_Dst.BroadcastStr##", $src1", + "$src1, ${src2}"##_Dst.BroadcastStr, + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Dst.VT (X86VBroadcast + (_Dst.ScalarLdFrag addr:$src2)))))), + itins.rm>, + AVX512BIBase, EVEX_4V, EVEX_B; + } +} + +defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, + SSE_INTALU_ITINS_P, 1>; +defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub, + SSE_INTALU_ITINS_P, 0>; +defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs, + SSE_INTALU_ITINS_P, HasBWI, 0>; +defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus, + 
SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, + SSE_INTALU_ITINS_P, HasBWI, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; +defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P, + HasBWI, 1>; +defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P, + HasBWI, 1>; +defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P, + HasBWI, 1>, T8PD; +defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, + SSE_INTALU_ITINS_P, HasBWI, 1>; + +multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, + SDNode OpNode, bit IsCommutable = 0> { + + defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode, + v16i32_info, v8i64_info, IsCommutable>, + EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode, + v8i32x_info, v4i64x_info, IsCommutable>, + EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W; + defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode, + v4i32x_info, v2i64x_info, IsCommutable>, + EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; + } +} + +defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, + X86pmuldq, 1>,T8PD; +defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, + X86pmuludq, 1>; + +multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { + let mayLoad = 1 in { + defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), + OpcodeStr, + "${src2}"##_Src.BroadcastStr##", $src1", + "$src1, ${src2}"##_Src.BroadcastStr, + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Src.VT (X86VBroadcast + (_Src.ScalarLdFrag addr:$src2))))))>, + EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>; + } +} + +multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode,X86VectorVTInfo _Src, + X86VectorVTInfo _Dst> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, + EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; + let mayLoad = 1 in { + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), + (bitconvert (_Src.LdFrag addr:$src2))))>, + EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>; + } +} + +multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info, + v32i16_info>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info, + v32i16_info>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info, + v16i16x_info>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info, + v16i16x_info>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info, + v8i16x_info>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info, + v8i16x_info>, EVEX_V128; + } +} +multiclass 
avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info, + v64i8_info>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info, + v32i8x_info>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info, + v16i8x_info>, EVEX_V128; + } +} + +multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo _Src, + AVX512VLVectorVTInfo _Dst> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, + _Dst.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, + _Dst.info256>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, + _Dst.info128>, EVEX_V128; + } +} + +let Predicates = [HasBWI] in { + defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD; + defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD; + defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W; + defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; + + defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD; + defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, + avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase; +} + +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +//===----------------------------------------------------------------------===// +// AVX-512 Logical Instructions +//===----------------------------------------------------------------------===// + +defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, + SSE_INTALU_ITINS_P, HasAVX512, 0>; + +//===----------------------------------------------------------------------===// +// AVX-512 FP arithmetic +//===----------------------------------------------------------------------===// +multiclass 
avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode, SDNode VecNode, OpndItins itins, + bit IsCommutable> { + + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT)), + itins.rr, IsCommutable>; + + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT)), + itins.rm, IsCommutable>; + let isCodeGenOnly = 1, isCommutable = IsCommutable, + Predicates = [HasAVX512] in { + def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], + itins.rr>; + def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2)))], itins.rr>; + } +} + +multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins itins, bit IsCommutable = 0> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$rc)), itins.rr, IsCommutable>, + EVEX_B, EVEX_RC; +} +multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins itins, bit IsCommutable> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} + +multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} +defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>; +defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_ALU_ITINS_S, 1>; +defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, 
SSE_ALU_ITINS_S, 0>; +defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_ALU_ITINS_S, 0>; +defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 1>; +defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 1>; + +multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, bit IsCommutable> { + defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))>, + EVEX_4V, EVEX_B; + }//let mayLoad = 1 +} + +multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86VectorVTInfo _> { + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>, + EVEX_4V, EVEX_B, EVEX_RC; +} + + +multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86VectorVTInfo _> { + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>, + EVEX_4V, EVEX_B; +} + +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit IsCommutable = 0> { + defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, + IsCommutable>, EVEX_V512, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info, + IsCommutable>, EVEX_V512, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. 
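+  // AVX512VL extends these EVEX-encoded operations to 128-bit and 256-bit
+  // vectors, so the PSZ128/PSZ256/PDZ128/PDZ256 forms below are guarded by
+  // the HasVLX predicate.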
+ let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info, + IsCommutable>, EVEX_V128, PS, + EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info, + IsCommutable>, EVEX_V256, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info, + IsCommutable>, EVEX_V128, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info, + IsCommutable>, EVEX_V256, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { + defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { + defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>, + avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>, + avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, + avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>, + avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>, + avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>; +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>, + avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>; +let Predicates = [HasDQI] in { + defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, 1>; + defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, 0>; + defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, 1>; + defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, 1>; +} + +multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>, + EVEX_4V, EVEX_B; + }//let mayLoad = 1 +} + +multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>; + let mayLoad = 1 in { + defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, 
$src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>; + }//let mayLoad = 1 +} + +multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f32x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNode, SSE_ALU_ITINS_S.s>, + EVEX_4V,EVEX_CD8<32, CD8VT1>; + defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f64x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNode, SSE_ALU_ITINS_S.d>, + EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v2f64x_info>, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f64x_info>, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD; + +//===----------------------------------------------------------------------===// +// AVX-512 VPTESTM instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, + EVEX_4V; + let mayLoad = 1 in + defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))>, + EVEX_4V, + EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))>, + EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; +} +multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, _.info256>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + } +} + +multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, + 
avx512vl_i32_info>; + defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, VEX_W; +} + +multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasBWI] in { + defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, v32i16_info>, + EVEX_V512, VEX_W; + defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, v64i8_info>, + EVEX_V512; + } + let Predicates = [HasVLX, HasBWI] in { + + defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, v16i16x_info>, + EVEX_V256, VEX_W; + defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, v8i16x_info>, + EVEX_V128, VEX_W; + defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, v32i8x_info>, + EVEX_V256; + defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>, + EVEX_V128; + } +} + +multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr, + SDNode OpNode> : + avx512_vptest_wb <opc_wb, OpcodeStr, OpNode>, + avx512_vptest_dq<opc_dq, OpcodeStr, OpNode>; + +defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD; +defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS; + +def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1), + (v16i32 VR512:$src2), (i16 -1))), + (COPY_TO_REGCLASS (VPTESTMDZrr VR512:$src1, VR512:$src2), GR16)>; + +def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1), + (v8i64 VR512:$src2), (i8 -1))), + (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>; + +//===----------------------------------------------------------------------===// +// AVX-512 Shift instructions +//===----------------------------------------------------------------------===// +multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), + (ins _.RC:$src1, u8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), + SSE_INTSHIFT_ITINS_P.rr>; + let mayLoad = 1 in + defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i8 imm:$src2))), + SSE_INTSHIFT_ITINS_P.rm>; +} + +multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let mayLoad = 1 in + defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, + "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", + (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))), + SSE_INTSHIFT_ITINS_P.rm>, EVEX_B; +} + +multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + // src2 is always 128-bit + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, VR128X:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))), + SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, i128mem:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))), + SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, + EVEX_4V; +} + +multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType 
SrcVT, PatFrag bc_frag, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info512>, EVEX_V512, + EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info256>, EVEX_V256, + EVEX_CD8<VTInfo.info256.EltSize, CD8VH>; + defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info128>, EVEX_V128, + EVEX_CD8<VTInfo.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw, + string OpcodeStr, SDNode OpNode> { + defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32, + avx512vl_i32_info, HasAVX512>; + defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64, + avx512vl_i64_info, HasAVX512>, VEX_W; + defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16, + avx512vl_i16_info, HasBWI>; +} + +multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, EVEX_V256; + defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, EVEX_V128; + } +} + +multiclass avx512_shift_rmi_w<bits<8> opcw, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode> { + let Predicates = [HasBWI] in + defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v32i16_info>, EVEX_V512; + let Predicates = [HasVLX, HasBWI] in { + defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v16i16x_info>, EVEX_V256; + defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v8i16x_info>, EVEX_V128; + } +} + +multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode> { + defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; +} + +defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>, + avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V; + +defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, + avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V; + +defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>, + avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V; + +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>, AVX512BIi8Base, EVEX_4V; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>, AVX512BIi8Base, EVEX_4V; + +defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; +defm VPSRL : 
avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>; + +//===-------------------------------------------------------------------===// +// Variable Bit Shifts +//===-------------------------------------------------------------------===// +multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))), + SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (_.VT (bitconvert (_.LdFrag addr:$src2))))), + SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V, + EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))), + SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; +} +multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + } +} + +multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, + avx512vl_i32_info>; + defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, VEX_W; +} + +// Use 512bit version to implement 128/256 bit in case NoVLX. 
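+// (Descriptive note, grounded in the patterns that follow:) the lowering widens
+// the 128/256-bit operands into a 512-bit register (INSERT_SUBREG into an
+// IMPLICIT_DEF zmm), runs the 512-bit WZrr form of the instruction, and then
+// uses EXTRACT_SUBREG to take back the original xmm/ymm part of the result.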
+multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> { + let Predicates = [HasBWI, NoVLX] in { + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), + (_.info256.VT _.info256.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), + (_.info128.VT _.info128.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; + } +} + +multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasBWI] in + defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>, + EVEX_V512, VEX_W; + let Predicates = [HasVLX, HasBWI] in { + + defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>, + EVEX_V256, VEX_W; + defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>, + EVEX_V128, VEX_W; + } +} + +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, + avx512_var_shift_w<0x12, "vpsllvw", shl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, shl>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, + avx512_var_shift_w<0x11, "vpsravw", sra>, + avx512_var_shift_w_lowering<avx512vl_i16_info, sra>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, + avx512_var_shift_w<0x10, "vpsrlvw", srl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, srl>; +defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; +defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; + +//===-------------------------------------------------------------------===// +// 1-src variable permutation VPERMW/D/Q +//===-------------------------------------------------------------------===// +multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; +} + +multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, EVEX_V256; +} + + +defm VPERM : avx512_var_shift_w<0x8D, "vpermw", X86VPermv>; + +defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv, + avx512vl_i32_info>; +defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv, + avx512vl_i64_info>, VEX_W; +defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv, + avx512vl_f32_info>; +defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv, + avx512vl_f64_info>, VEX_W; + +defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", + 
X86VPermi, avx512vl_i64_info>, + EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; +defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", + X86VPermi, avx512vl_f64_info>, + EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - VPERMIL +//===----------------------------------------------------------------------===// + +multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86VectorVTInfo Ctrl> { + defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (Ctrl.VT Ctrl.RC:$src2)))>, + T8PD, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (X86VBroadcast + (Ctrl.ScalarLdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + }//let mayLoad = 1 +} + +multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512, + Ctrl.info512>, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128, + Ctrl.info128>, EVEX_V128; + defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256, + Ctrl.info256>, EVEX_V256; + } +} + +multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + + defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>; + defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr, + X86VPermilpi, _>, + EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; +} + +defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, + avx512vl_i32_info>; +defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, + avx512vl_i64_info>, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW +//===----------------------------------------------------------------------===// + +defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd", + X86PShufd, avx512vl_i32_info>, + EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>; +defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", + X86PShufhw>, EVEX, AVX512XSIi8Base; +defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", + X86PShuflw>, EVEX, AVX512XDIi8Base; + +multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { + let Predicates = [HasBWI] in + defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512; + + let Predicates = [HasVLX, HasBWI] in { + defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, v32i8x_info>, EVEX_V256; + defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, v16i8x_info>, EVEX_V128; + } +} + +defm VPSHUFB: 
avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>; + +//===----------------------------------------------------------------------===// +// Move Low to High and High to Low packed FP Instructions +//===----------------------------------------------------------------------===// +def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))], + IIC_SSE_MOV_LH>, EVEX_4V; +def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))], + IIC_SSE_MOV_LH>, EVEX_4V; + +let Predicates = [HasAVX512] in { + // MOVLHPS patterns + def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)), + (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)), + (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>; + + // MOVHLPS patterns + def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)), + (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>; +} + +//===----------------------------------------------------------------------===// +// VMOVHPS/PD VMOVLPS Instructions +// All patterns were taken from the SSE implementation. +//===----------------------------------------------------------------------===// +multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (OpNode _.RC:$src1, + (_.VT (bitconvert + (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))], + IIC_SSE_MOV_LH>, EVEX_4V; +} + +defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; +defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; + +let Predicates = [HasAVX512] in { + // VMOVHPS patterns + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVHPD patterns + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPS patterns + def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPD patterns + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, + (v2f64 (scalar_to_vector
(loadf64 addr:$src2))))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; +} + +let mayStore = 1 in { +def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)), + (bc_v2f64 (v4f32 VR128X:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128X:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128X:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +} +let Predicates = [HasAVX512] in { + // VMOVHPD patterns + def : Pat<(store (f64 (vector_extract + (v2f64 (X86VPermilpi VR128X:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDZ128mr addr:$dst, VR128X:$src)>; + // VMOVLPS patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + // VMOVLPD patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; +} +//===----------------------------------------------------------------------===// +// FMA - Fused Multiply Operations +// + +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + AVX512FMA3Base; + + let mayLoad = 1 in { + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (OpNode _.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, + AVX512FMA3Base, EVEX_B; + } +} + +multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; +} +} // Constraints = "$src1 = $dst" + 
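+// Note on the 132/213/231 forms (consistent with the patterns in these
+// multiclasses): with the destination tied to the first source, 213 computes
+// dst = src2 * dst + src3, 231 computes dst = src2 * src3 + dst, and 132
+// computes dst = dst * src3 + src2; the memory/broadcast operand is always the
+// last source, and the three encodings let it take any of the three roles.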
+multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd > { + defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + avx512vl_f32_info>; + defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + avx512vl_f64_info>, VEX_W; +} + +defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>; +defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>; + + +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>, + AVX512FMA3Base; + + let mayLoad = 1 in { + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src2, + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + _.RC:$src1))>, AVX512FMA3Base, EVEX_B; + } +} + +multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; +} +} // Constraints = "$src1 = $dst" + +multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode 
OpNodeRnd > { + defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + avx512vl_f32_info>; + defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + avx512vl_f64_info>, VEX_W; +} + +defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>; +defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>; + +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.RC:$src2), + OpcodeStr, "$src2, $src3", "$src3, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + AVX512FMA3Base; + + let mayLoad = 1 in { + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.MemOp:$src2), + OpcodeStr, "$src2, $src3", "$src3, $src2", + (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.ScalarMemOp:$src2), + OpcodeStr, "${src2}"##_.BroadcastStr##", $src3", + "$src3, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + _.RC:$src3))>, AVX512FMA3Base, EVEX_B; + } +} + +multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.RC:$src2, AVX512RC:$rc), + OpcodeStr, "$rc, $src2, $src3", "$src3, $src2, $rc", + (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; +} +} // Constraints = "$src1 = $dst" + +multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd > { + defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + avx512vl_f32_info>; + defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + avx512vl_f64_info>, VEX_W; +} + +defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>; +defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, 
X86FnmsubRnd>; + +// Scalar FMA +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb, + dag RHS_r, dag RHS_m > { + defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), OpcodeStr, + "$src3, $src2", "$src2, $src3", RHS_VEC_r>, AVX512FMA3Base; + + let mayLoad = 1 in + defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, + "$src3, $src2", "$src2, $src3", RHS_VEC_m>, AVX512FMA3Base; + + defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb>, + AVX512FMA3Base, EVEX_B, EVEX_RC; + + let isCodeGenOnly = 1 in { + def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [RHS_r]>; + let mayLoad = 1 in + def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [RHS_m]>; + }// isCodeGenOnly = 1 +} +}// Constraints = "$src1 = $dst" + +multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ , + string SUFF> { + + defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ , + (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), + (_.VT (OpNode _.RC:$src2, _.RC:$src1, + (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))))), + (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, + (i32 imm:$rc))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, + _.FRC:$src3))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, + (_.ScalarLdFrag addr:$src3))))>; + + defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ , + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), + (_.VT (OpNode _.RC:$src2, + (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), + _.RC:$src1)), + (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, + (i32 imm:$rc))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3, + _.FRC:$src1))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, + (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>; + + defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ , + (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), + (_.VT (OpNode _.RC:$src1, + (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), + _.RC:$src2)), + (_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, + (i32 imm:$rc))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3, + _.FRC:$src2))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>; +} + +multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{ + let Predicates = [HasAVX512] in { + defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, + OpNodeRnd, f32x_info, "SS">, + EVEX_CD8<32, CD8VT1>, VEX_LIG; + defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, + OpNodeRnd, f64x_info, "SD">, + EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; + } +} + +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>; +defm VFMSUB : 
avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; + +//===----------------------------------------------------------------------===// +// AVX-512 Scalar convert from sign integer to float/double +//===----------------------------------------------------------------------===// + +multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + X86VectorVTInfo DstVT, X86MemOperand x86memop, + PatFrag ld_frag, string asm> { + let hasSideEffects = 0 in { + def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst), + (ins DstVT.FRC:$src1, SrcRC:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + EVEX_4V; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst), + (ins DstVT.FRC:$src1, x86memop:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + EVEX_4V; + } // hasSideEffects = 0 + let isCodeGenOnly = 1 in { + def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, SrcRC:$src2), + !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + SrcRC:$src2, + (i32 FROUND_CURRENT)))]>, EVEX_4V; + + def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, x86memop:$src2), + !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + (ld_frag addr:$src2), + (i32 FROUND_CURRENT)))]>, EVEX_4V; + }//isCodeGenOnly = 1 +} + +multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + X86VectorVTInfo DstVT, string asm> { + def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), + !strconcat(asm, + "\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + SrcRC:$src2, + (i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC; +} + +multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + X86VectorVTInfo DstVT, X86MemOperand x86memop, + PatFrag ld_frag, string asm> { + defm NAME : avx512_vcvtsi_round<opc, OpNode, SrcRC, DstVT, asm>, + avx512_vcvtsi<opc, OpNode, SrcRC, DstVT, x86memop, ld_frag, asm>, + VEX_LIG; +} + +let Predicates = [HasAVX512] in { +defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32, + v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64, + v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">, + XS, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32, + v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">, + XD, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64, + v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; + +def : Pat<(f32 (sint_to_fp GR32:$src)), + (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f32 (sint_to_fp 
GR64:$src)), + (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(f64 (sint_to_fp GR32:$src)), + (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f64 (sint_to_fp GR64:$src)), + (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; + +defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32, + v4f32x_info, i32mem, loadi32, + "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, + v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">, + XS, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info, + i32mem, loadi32, "cvtusi2sd{l}">, + XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, + v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), + (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))), + (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))), + (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))), + (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; + +def : Pat<(f32 (uint_to_fp GR32:$src)), + (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f32 (uint_to_fp GR64:$src)), + (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(f64 (uint_to_fp GR32:$src)), + (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f64 (uint_to_fp GR64:$src)), + (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 Scalar convert from float/double to integer +//===----------------------------------------------------------------------===// +multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat, string asm> { + let hasSideEffects = 0, Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG; + def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>, + EVEX, VEX_LIG, EVEX_B, EVEX_RC; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG; + } // hasSideEffects = 0, Predicates = [HasAVX512] +} + +// Convert float/double to signed/unsigned int 32/64 +defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si">, + XS, VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtss2usi, + ssmem, sse_load_f32, "cvtss2usi">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, + int_x86_avx512_cvtss2usi64, ssmem, + sse_load_f32, "cvtss2usi">, XS, VEX_W, + EVEX_CD8<32, CD8VT1>; +defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, + sdmem, sse_load_f64, "cvtsd2si">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + 
int_x86_sse2_cvtsd2si64, + sdmem, sse_load_f64, "cvtsd2si">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtsd2usi, + sdmem, sse_load_f64, "cvtsd2usi">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, + int_x86_avx512_cvtsd2usi64, sdmem, + sse_load_f64, "cvtsd2usi">, XD, VEX_W, + EVEX_CD8<64, CD8VT1>; + +let isCodeGenOnly = 1 , Predicates = [HasAVX512] in { + defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", + SSE_CVT_Scalar, 0>, XS, EVEX_4V; + defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", + SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W; + defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", + SSE_CVT_Scalar, 0>, XD, EVEX_4V; + defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", + SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W; + + defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}", + SSE_CVT_Scalar, 0>, XD, EVEX_4V; +} // isCodeGenOnly = 1, Predicates = [HasAVX512] + +// Convert float/double to signed/unsigned int 32/64 with truncation +multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, + X86VectorVTInfo _DstRC, SDNode OpNode, + SDNode OpNodeRnd>{ +let Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX; + def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + []>, EVEX, EVEX_B; + def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.MemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, + EVEX; + + let isCodeGenOnly = 1,hasSideEffects = 0 in { + def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; + def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_NO_EXC)))]>, + EVEX,VEX_LIG , EVEX_B; + let mayLoad = 1 in + def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), + (ins _SrcRC.MemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + []>, EVEX, VEX_LIG; + + } // isCodeGenOnly = 1, hasSideEffects = 0 +} //HasAVX512 +} + + +defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, + fp_to_sint,X86cvttss2IntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, + fp_to_sint,X86cvttss2IntRnd>, + VEX_W, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, + fp_to_sint,X86cvttsd2IntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, + fp_to_sint,X86cvttsd2IntRnd>, + VEX_W, XD, EVEX_CD8<64, CD8VT1>; + +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, + 
fp_to_uint,X86cvttss2UIntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS,VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; +let Predicates = [HasAVX512] in { + def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))), + (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))), + (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))), + (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))), + (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + +} // HasAVX512 +//===----------------------------------------------------------------------===// +// AVX-512 Convert from float to double and back +//===----------------------------------------------------------------------===// +multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode> { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; + defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT (scalar_to_vector + (_Src.ScalarLdFrag addr:$src2)))))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +// Scalar Conversion with SAE - suppress all exceptions +multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), + (i32 FROUND_NO_EXC)))>, + EVEX_4V, VEX_LIG, EVEX_B; +} + +// Scalar Conversion with rounding control (RC) +multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, + EVEX_B, EVEX_RC; +} +multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, + OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, + EVEX_V512, XD; + } +} + +multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let
Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, + EVEX_CD8<32, CD8VT1>, XS, EVEX_V512; + } +} +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround, + X86froundRnd, f64x_info, f32x_info>; +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, + X86fpextRnd,f32x_info, f64x_info >; + +def : Pat<(f64 (fextend FR32X:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), + (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +def : Pat<(f64 (fextend (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[HasAVX512]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[HasAVX512, OptForSize]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, + Requires<[HasAVX512, OptForSpeed]>; + +def : Pat<(f32 (fround FR64X:$src)), + (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), + (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +//===----------------------------------------------------------------------===// +// AVX-512 Vector convert from signed/unsigned integer to float/double +// and from float/double to signed/unsigned integer +//===----------------------------------------------------------------------===// + +multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode, + string Broadcast = _.BroadcastStr, + string Alias = ""> { + + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX; + + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr#Alias, "$src", "$src", + (_.VT (OpNode (_Src.VT + (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX; + + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, + "${src}"##Broadcast, "${src}"##Broadcast, + (_.VT (OpNode (_Src.VT + (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + ))>, EVEX, EVEX_B; +} +// Conversion with SAE - suppress all exceptions +multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, + "{sae}, $src", "$src, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), + (i32 FROUND_NO_EXC)))>, + EVEX, EVEX_B; +} + +// Conversion with rounding control (RC) +multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr, + "$rc, $src", "$src, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>, + EVEX, EVEX_B, EVEX_RC; +} + +// Extend Float to Double +multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fextend>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, + X86vfpextRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 :
avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info, + X86vfpext, "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fextend>, + EVEX_V256; + } +} + +// Truncate Double to Float +multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fround>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, + X86vfproundRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, + X86vfpround, "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fround, + "{1to4}", "{y}">, EVEX_V256; + } +} + +defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">, + VEX_W, PD, EVEX_CD8<64, CD8VF>; +defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">, + PS, EVEX_CD8<32, CD8VH>; + +def : Pat<(v8f64 (extloadv8f32 addr:$src)), + (VCVTPS2PDZrm addr:$src)>; + +let Predicates = [HasVLX] in { + def : Pat<(v4f64 (extloadv4f32 addr:$src)), + (VCVTPS2PDZ256rm addr:$src)>; +} + +// Convert Signed/Unsigned Doubleword to Double +multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128> { + // No rounding in this op + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>, + EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info, + OpNode128, "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Doubleword to Float +multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info, + OpNodeRnd>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword +multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode 
OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + // We need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in the asm parser. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // for the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Doubleword +multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + // We need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in the asm parser. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // for the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Quadword +multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Quadword with truncation +multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Quadword to Double +multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Quadword +multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z :
avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Quardword with truncation +multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Quardword to Float +multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // we need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in Asm Parcer. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // due to the same reason. 
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86cvtdq2pd>, XS, + EVEX_CD8<32, CD8VH>; + +defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, + X86VSintToFpRnd>, + PS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint, + X86VFpToSintRnd>, + XS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, + X86VFpToSintRnd>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint, + X86VFpToUintRnd>, PS, + EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, + X86VFpToUintRnd>, PS, VEX_W, + EVEX_CD8<64, CD8VF>; + +defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86cvtudq2pd>, + XS, EVEX_CD8<32, CD8VH>; + +defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp, + X86VUintToFpRnd>, XD, + EVEX_CD8<32, CD8VF>; + +defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtps2Int, + X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VF>; + +defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtpd2Int, + X86cvtpd2IntRnd>, XD, VEX_W, + EVEX_CD8<64, CD8VF>; + +defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtps2UInt, + X86cvtps2UIntRnd>, + PS, EVEX_CD8<32, CD8VF>; +defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtpd2UInt, + X86cvtpd2UIntRnd>, VEX_W, + PS, EVEX_CD8<64, CD8VF>; + +defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtpd2Int, + X86cvtpd2IntRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtps2Int, + X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtpd2UInt, + X86cvtpd2UIntRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtps2UInt, + X86cvtps2UIntRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint, + X86VFpToSlongRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, + X86VFpToSlongRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint, + X86VFpToUlongRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, + X86VFpToUlongRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, + X86VSlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; + +defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, + X86VUlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; + +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, + X86VSlongToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>; + +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, + X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; + +let Predicates = [HasAVX512, NoVLX] in { +def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), + (EXTRACT_SUBREG 
(v16f32 (VCVTUDQ2PSZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr + (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v8f32 (fround (loadv8f64 addr:$src))), + (VCVTPD2PSZrm addr:$src)>; + def : Pat<(v8f64 (extloadv8f32 addr:$src)), + (VCVTPS2PDZrm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// Half precision conversion instructions +//===----------------------------------------------------------------------===// +multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop, PatFrag ld_frag> { + defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_CURRENT))>, T8PD; + let hasSideEffects = 0, mayLoad = 1 in { + defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))), + (i32 FROUND_CURRENT))>, T8PD; + } +} + +multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "{sae}, $src", "$src, {sae}", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_NO_EXC))>, T8PD, EVEX_B; + +} + +let Predicates = [HasAVX512] in { + defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>, + avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, + loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, + loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} + +multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop> { + defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, $src1", "$src1, $src2", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>, AVX512AIi8Base; + let hasSideEffects = 0, mayStore = 1 in { + def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), (i32 FROUND_CURRENT) )), + addr:$dst)]>; + def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + []>, EVEX_K; + } +} +multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base; +} +let Predicates = [HasAVX512] in { + defm VCVTPS2PHZ : 
avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>, + avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>, + EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>, + EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} + +// Unordered/Ordered scalar fp compare with SAE and set EFLAGS +multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode, + string OpcodeStr> { + def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), + [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2, + (i32 FROUND_NO_EXC)))], + IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, + Sched<[WriteFAdd]>; +} + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, + "ucomiss">, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64, + "ucomisd">, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + let Pattern = []<dag> in { + defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32, + "comiss">, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64, + "comisd">, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + } + let isCodeGenOnly = 1 in { + defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem, + load, "ucomiss">, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem, + load, "ucomisd">, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + + defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem, + load, "comiss">, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem, + load, "comisd">, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + } +} + +/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd +multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let hasSideEffects = 0, AddedComplexity = 20 , Predicates = [HasAVX512] in { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V; + let mayLoad = 1 in { + defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V; + } +} +} + +defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", 
X86frcp14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; +defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; + +/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd +multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD; + let mayLoad = 1 in { + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD; + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, T8PD, EVEX_B; + } +} + +multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v4f32x_info>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v8f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v2f64x_info>, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v4f64x_info>, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} + +defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; +defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; + +/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd +multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode> { + + defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; + + defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; + + defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; +} + +multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>, + EVEX_CD8<32, CD8VT1>; + defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>, + EVEX_CD8<64, CD8VT1>, VEX_W; +} + +let hasSideEffects = 0, Predicates = [HasERI] in { + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; +} + +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V; 
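The VRCP14* / VRSQRT14* forms defined above only guarantee roughly 2^-14 relative error, so the usual pattern is to treat them as a seed and refine it with one Newton-Raphson step rather than use the raw result. A minimal, hypothetical C++ sketch of that usage, assuming an AVX512F target and the standard <immintrin.h> intrinsic names (_mm512_rcp14_ps and friends), not anything declared in this file:

#include <immintrin.h>

// One Newton-Raphson step on top of the ~2^-14 accurate vrcp14ps seed:
//   x1 = x0 * (2 - a * x0)
static inline __m512 recip_refined(__m512 a) {
  __m512 x0  = _mm512_rcp14_ps(a);            // vrcp14ps approximation seed
  __m512 two = _mm512_set1_ps(2.0f);
  __m512 t   = _mm512_fnmadd_ps(a, x0, two);  // 2 - a*x0
  return _mm512_mul_ps(x0, t);                // x0 * (2 - a*x0)
}

One such step roughly squares the relative error (to about 2^-28), which is also the accuracy that the vrcp28/vrsqrt28 ERI forms handled next provide directly.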
+/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd + +multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode> { + + defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>; + + defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))), + (i32 FROUND_CURRENT))>; + + defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))), + (i32 FROUND_CURRENT))>, EVEX_B; +} +multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode> { + defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, + "{sae}, $src", "$src, {sae}", + (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>, + avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode>, + T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>, + avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode>, + T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode>, + EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode>, + EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode>, + EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode>, + EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + } +} +let Predicates = [HasERI], hasSideEffects = 0 in { + + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX; +} +defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>, + avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd> , EVEX; + +multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, + SDNode OpNodeRnd, X86VectorVTInfo _>{ + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc", + (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>, + EVEX, EVEX_B, EVEX_RC; +} + +multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.FloatVT (OpNode _.RC:$src))>, EVEX; + let mayLoad = 1 in { + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))))>, EVEX; + + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, EVEX_B; + } +} + +multiclass 
avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + v8f64_info>, + EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v4f32x_info>, + EVEX_V128, PS, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v8f32x_info>, + EVEX_V256, PS, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v2f64x_info>, + EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v4f64x_info>, + EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, + SDNode OpNodeRnd> { + defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), OpNodeRnd, + v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), OpNodeRnd, + v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + string SUFF, SDNode OpNode, SDNode OpNodeRnd> { + + defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in + defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; + + defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$rc))>, + EVEX_B, EVEX_RC; + + let isCodeGenOnly = 1 in { + def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + } + + def : Pat<(_.EltVT (OpNode _.FRC:$src)), + (!cast<Instruction>(NAME#SUFF#Zr) + (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; + + def : Pat<(_.EltVT (OpNode (load addr:$src))), + (!cast<Instruction>(NAME#SUFF#Zm) + (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>; +} + +multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> { + defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt, + X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; + defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt, + X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W; +} + +defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>, + avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>; + +defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG; + +let Predicates = [HasAVX512] in { + def : Pat<(f32 (X86frsqrt FR32X:$src)), + (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 
(IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>; + def : Pat<(f32 (X86frsqrt (load addr:$src))), + (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[OptForSize]>; + def : Pat<(f32 (X86frcp FR32X:$src)), + (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>; + def : Pat<(f32 (X86frcp (load addr:$src))), + (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[OptForSize]>; +} + +multiclass +avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + + let ExeDomain = _.ExeDomain in { + defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + + defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B; + + let mayLoad = 1 in + defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + } + let Predicates = [HasAVX512] in { + def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>; + def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>; + def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>; + + def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x1))), _.FRC)>; + def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x3))), _.FRC)>; + def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0xc))), _.FRC)>; + } +} + +defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, + 
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; + +//------------------------------------------------- +// Integer truncate and extend operations +//------------------------------------------------- + +multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo, + X86MemOperand x86memop> { + + defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>, + EVEX, T8XS; + + // for intrinsic pattern match + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + undef)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.ImmAllZerosV)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.RC:$src0)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0, + DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + let mayStore = 1 in { + def mr : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst |$dst, $src}", + []>, EVEX; + + def mrk : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + []>, EVEX, EVEX_K; + }//mayStore = 1 +} + +multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, + PatFrag truncFrag, PatFrag mtruncFrag > { + + def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) + addr:$dst, SrcInfo.RC:$src)>; + + def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask, + (SrcInfo.VT SrcInfo.RC:$src)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) + addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; +} + +multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, string sat > { + + def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr, + (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM), + (SrcInfo.VT SrcInfo.RC:$src))>; + + def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr, + (SrcInfo.VT SrcInfo.RC:$src))>; +} + +multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag, + Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + truncFrag, mtruncFrag>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, 
VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + truncFrag, mtruncFrag>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ, + truncFrag, mtruncFrag>, EVEX_V512; +} + +multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + sat>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + sat>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ, + sat>, EVEX_V512; +} + +multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>; +} +multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + sat>, EVEX_CD8<8, CD8VO>; +} + +multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>; +} +multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<16, CD8VQ>; +} + +multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>; +} +multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<32, CD8VH>; +} + +multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>; +} +multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<8, CD8VQ>; +} + +multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: 
avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<16, CD8VH>; +} + +multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + sat, HasBWI>, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>; +defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>; +defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>; +defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>; +defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>; +defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>; +defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>; +defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>; +defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>; +defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>; +defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>; +defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>; +defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>; + +let Predicates = [HasAVX512, NoVLX] in { +def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))), + (v8i16 (EXTRACT_SUBREG + (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))), + (v4i32 (EXTRACT_SUBREG + (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +} + +let Predicates = [HasBWI, NoVLX] in { +def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))), + (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm))), sub_xmm))>; +} + +multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, + X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{ + + defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>, + EVEX; + + let mayLoad = 1 in { + defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins x86memop:$src), OpcodeStr ,"$src", "$src", + (DestInfo.VT (LdFrag addr:$src))>, + EVEX; + } +} + +multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasBWI] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info, + 
v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info, + v16i8x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256; + } + let Predicates = [HasBWI] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v32i16_info, + v32i8x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, + v16i8x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, + v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info, + v16i8x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + v16i8x_info, i16mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + v16i8x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, + v8i16x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, + v8i16x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info, + v16i16x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + v8i16x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + v8i16x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + v8i16x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { + + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + v4i32x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + v4i32x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, 
EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + v8i32x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512; + } +} + +defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">; +defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">; +defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">; +defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">; +defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">; +defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">; + + +defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">; +defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">; +defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">; +defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">; +defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">; +defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">; + +//===----------------------------------------------------------------------===// +// GATHER - SCATTER Operations + +multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag GatherNode> { + let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb", + ExeDomain = _.ExeDomain in + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb), + (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2), + !strconcat(OpcodeStr#_.Suffix, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + [(set _.RC:$dst, _.KRCWM:$mask_wb, + (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask, + vectoraddr:$src2))]>, EVEX, EVEX_K, + EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, + vy32xmem, mgatherv8i32>, EVEX_V512, VEX_W; + defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512, + vz64mem, mgatherv8i64>, EVEX_V512, VEX_W; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, + vx32xmem, mgatherv4i32>, EVEX_V256, VEX_W; + defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256, + vy64xmem, mgatherv4i64>, EVEX_V256, VEX_W; + defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mgatherv4i32>, EVEX_V128, VEX_W; + defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mgatherv2i64>, EVEX_V128, VEX_W; +} +} + +multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz32mem, + mgatherv16i32>, EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz64mem, + mgatherv8i64>, EVEX_V512; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, + vy32xmem, mgatherv8i32>, EVEX_V256; + defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vy64xmem, mgatherv4i64>, EVEX_V256; + defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mgatherv4i32>, EVEX_V128; + defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mgatherv2i64>, EVEX_V128; +} +} + + +defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">, + 
avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">; + +defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">, + avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">; + +multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag ScatterNode> { + +let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in + + def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb), + (ins memop:$dst, _.KRCWM:$mask, _.RC:$src), + !strconcat(OpcodeStr#_.Suffix, + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src), + _.KRCWM:$mask, vectoraddr:$dst))]>, + EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, + vy32xmem, mscatterv8i32>, EVEX_V512, VEX_W; + defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512, + vz64mem, mscatterv8i64>, EVEX_V512, VEX_W; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, + vx32xmem, mscatterv4i32>, EVEX_V256, VEX_W; + defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256, + vy64xmem, mscatterv4i64>, EVEX_V256, VEX_W; + defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mscatterv4i32>, EVEX_V128, VEX_W; + defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mscatterv2i64>, EVEX_V128, VEX_W; +} +} + +multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz32mem, + mscatterv16i32>, EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz64mem, + mscatterv8i64>, EVEX_V512; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, + vy32xmem, mscatterv8i32>, EVEX_V256; + defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vy64xmem, mscatterv4i64>, EVEX_V256; + defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mscatterv4i32>, EVEX_V128; + defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mscatterv2i64>, EVEX_V128; +} +} + +defm VSCATTER : avx512_scatter_q_pd<0xA2, 0xA3, avx512vl_f64_info, "vscatter", "PD">, + avx512_scatter_d_ps<0xA2, 0xA3, avx512vl_f32_info, "vscatter", "PS">; + +defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", "Q">, + avx512_scatter_d_ps<0xA0, 0xA1, avx512vl_i32_info, "vpscatter", "D">; + +// prefetch +multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr, + RegisterClass KRC, X86MemOperand memop> { + let Predicates = [HasPFI], hasSideEffects = 1 in + def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src), + !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), + []>, EVEX, EVEX_K; +} + +defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", + VK8WM, vy32mem>, EVEX_V512, 
VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +// Helper fragments to match sext vXi1 to vXiY. 
+def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; +def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; + +def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; + +def : Pat<(store VK1:$src, addr:$dst), + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(store VK8:$src, addr:$dst), + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; + +def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1; +}]>; + +def : Pat<(truncstorei1 GR8:$src, addr:$dst), + (MOV8mr addr:$dst, GR8:$src)>; + +multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { +def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), + !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"), + [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX; +} + +multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, + string OpcodeStr, Predicate prd> { +let Predicates = [prd] in + defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +multiclass avx512_convert_mask_to_vector<string OpcodeStr> { + defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr, + HasBWI>; + defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr, + HasBWI>, VEX_W; + defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr, + HasDQI>; + defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, + HasDQI>, VEX_W; +} + +defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; + +multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { +def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX; +} + +multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { +let Predicates = [prd] in + defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>, + EVEX_V256; + defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>, + EVEX_V128; + } +} + +defm VPMOVB2M : avx512_convert_vector_to_mask<0x29, "vpmovb2m", + avx512vl_i8_info, HasBWI>; +defm VPMOVW2M : avx512_convert_vector_to_mask<0x29, "vpmovw2m", + avx512vl_i16_info, HasBWI>, VEX_W; +defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m", + avx512vl_i32_info, HasDQI>; +defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", + avx512vl_i64_info, HasDQI>, VEX_W; + +//===----------------------------------------------------------------------===// +// AVX-512 - COMPRESS and EXPAND +// + +multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr> { + defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), + 
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86compress _.RC:$src1))>, AVX5128IBase; + + let mayStore = 1 in { + def mr : AVX5128I<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.RC:$src), + OpcodeStr # "\t{$src, $dst |$dst, $src}", + []>, EVEX_CD8<_.EltSize, CD8VT1>; + + def mrk : AVX5128I<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(store (_.VT (vselect _.KRCWM:$mask, + (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), + addr:$dst)]>, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + } +} + +multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>, + EVEX; +defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>, + EVEX; +defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>, + EVEX, VEX_W; + +// expand +multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86expand _.RC:$src1))>, AVX5128IBase; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86expand (_.VT (bitconvert + (_.LdFrag addr:$src1)))))>, + AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>, + EVEX; +defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, + EVEX; +defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, + EVEX, VEX_W; + +//handle instruction reg_vec1 = op(reg_vec,imm) +// op(mem_vec,imm) +// op(broadcast(eltVt),imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src2, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in { + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, + 
"${src1}"##_.BroadcastStr##", $src2", + (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>, EVEX_B; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2,{sae}, $src1", + "$src1, {sae}, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + let Predicates = [prd] in { + defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) +// op(reg_vec2,broadcast(eltVt),imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in { + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>, EVEX_B; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) +multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{ + + defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT SrcInfo.RC:$src2), + (i8 imm:$src3)))>; + let mayLoad = 1 in + defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT (bitconvert + (SrcInfo.LdFrag addr:$src2))), + (i8 imm:$src3)))>; +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) +// op(reg_vec2,broadcast(eltVt),imm) +multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>: + avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{ + + let 
mayLoad = 1 in + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (i8 imm:$src3))>, EVEX_B; +} + +//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_scalar,imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + + defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in { + defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>; + + let isAsmParserOnly = 1 in { + defm rmi_alt :AVX512_maskable_in_asm<opc, MRMSrcMem, _, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + []>; + } + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3,{sae}, $src2, $src1", + "$src1, $src2,{sae}, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, EVEX_B; +} +//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _> { + defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3,{sae}, $src2, $src1", + "$src1, $src2,{sae}, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + let Predicates = [prd] in { + defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + EVEX_V512; + + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128; + defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256; + } +} + +multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr, + AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{ + let Predicates = [HasBWI] in { + defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512, + SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V; + } + let Predicates = [HasBWI, HasVLX] in { + defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128, + SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V; + defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256, + SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V; + } +} + 
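The rri/rmi/rmbi variants generated by the multiclasses above are the three EVEX source forms of an immediate instruction: register, full-width memory, and embedded broadcast (EVEX_B), the last spelled with a {1to8}/{1to16}-style suffix in assembly and loading a single scalar that is replicated into every lane before the packed operation. A minimal scalar sketch of that broadcast semantics; packed_op_broadcast, Op and NLanes are illustrative placeholders, not names defined in this file:

    #include <array>
    #include <cstddef>

    // Models "op dst, src1, mem{1toN}, imm": one scalar load, reused per lane.
    template <typename T, std::size_t NLanes, typename Op>
    std::array<T, NLanes> packed_op_broadcast(const std::array<T, NLanes> &src1,
                                              const T *mem, int imm, Op op) {
      std::array<T, NLanes> dst{};
      const T splat = *mem;                   // single scalar load ...
      for (std::size_t i = 0; i < NLanes; ++i)
        dst[i] = op(src1[i], splat, imm);     // ... used for every lane
      return dst;
    }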
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, + bits<8> opc, SDNode OpNode>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + } +} + +multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, + X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + let Predicates = [prd] in { + defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, _>, + avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNode, _>; + } +} + +multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, + bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{ + defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, + opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>; + defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, + opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W; +} + +defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd", + avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VFIXUPIMMPS : avx512_common_fp_sae_packed_imm<"vfixupimmps", + avx512vl_f32_info, 0x54, X86VFixupimm, HasAVX512>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + +defm VFIXUPIMMSD: avx512_common_fp_sae_scalar_imm<"vfixupimmsd", f64x_info, + 0x55, X86VFixupimm, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info, + 0x55, X86VFixupimm, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, + X86VReduce, HasDQI>, AVX512AIi8Base, EVEX; +defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, + X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX; +defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, + X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX; + + +defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, + 0x50, X86VRange, HasDQI>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info, + 0x50, X86VRange, HasDQI>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + +defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info, + 0x51, X86VRange, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, + 0x51, X86VRange, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +multiclass avx512_shuff_packed_128<string 
OpcodeStr, AVX512VLVectorVTInfo _, + bits<8> opc, SDNode OpNode = X86Shuf128>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + } +} +let Predicates = [HasAVX512] in { +def : Pat<(v16f32 (ffloor VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>; +def : Pat<(v16f32 (fnearbyint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; +def : Pat<(v16f32 (fceil VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>; +def : Pat<(v16f32 (frint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; +def : Pat<(v16f32 (ftrunc VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>; + +def : Pat<(v8f64 (ffloor VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>; +def : Pat<(v8f64 (fnearbyint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; +def : Pat<(v8f64 (fceil VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>; +def : Pat<(v8f64 (frint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; +def : Pat<(v8f64 (ftrunc VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>; +} + +defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; +defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2",avx512vl_f64_info, 0x23>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; +defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; + +multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> { + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>, + AVX512AIi8Base, EVEX_4V; +} + +defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>, + EVEX_CD8<32, CD8VF>; +defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>, + EVEX_CD8<64, CD8VF>, VEX_W; + +multiclass avx512_vpalign_lowering<X86VectorVTInfo _ , list<Predicate> p>{ + let Predicates = p in + def NAME#_.VTName#rri: + Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast<Instruction>(NAME#_.ZSuffix#rri) + _.RC:$src1, _.RC:$src2, imm:$imm)>; +} + +multiclass avx512_vpalign_lowering_common<AVX512VLVectorVTInfo _>: + avx512_vpalign_lowering<_.info512, [HasBWI]>, + avx512_vpalign_lowering<_.info128, [HasBWI, HasVLX]>, + avx512_vpalign_lowering<_.info256, [HasBWI, HasVLX]>; + +defm VPALIGN: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" , + avx512vl_i8_info, avx512vl_i8_info>, + avx512_vpalign_lowering_common<avx512vl_i16_info>, + avx512_vpalign_lowering_common<avx512vl_i32_info>, + avx512_vpalign_lowering_common<avx512vl_f32_info>, + avx512_vpalign_lowering_common<avx512vl_i64_info>, + avx512_vpalign_lowering_common<avx512vl_f64_info>, + EVEX_CD8<8, CD8VF>; + +defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" , + avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; + +multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, + "$src1", "$src1", + (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1), OpcodeStr, + "$src1", 
"$src1", + (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, + EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> : + avx512_unary_rm<opc, OpcodeStr, OpNode, _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1), OpcodeStr, + "${src1}"##_.BroadcastStr, + "${src1}"##_.BroadcastStr, + (_.VT (OpNode (X86VBroadcast + (_.ScalarLdFrag addr:$src1))))>, + EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, Predicate prd> { + defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info, + prd>, VEX_W; + defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info, + prd>; +} + +multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, Predicate prd> { + defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>; + defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>; +} + +multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode> { + defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, + HasAVX512>, + avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, + HasBWI>; +} + +defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>; + +def : Pat<(xor + (bc_v16i32 (v16i1sextv16i32)), + (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))), + (VPABSDZrr VR512:$src)>; +def : Pat<(xor + (bc_v8i64 (v8i1sextv8i64)), + (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), + (VPABSQZrr VR512:$src)>; + +multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{ + + defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>; +} + +defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>; +defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>; + +//===---------------------------------------------------------------------===// +// Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info, + HasAVX512>, XS; +} + +defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>; +defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>; + 
+//===----------------------------------------------------------------------===// +// AVX-512 - MOVDDUP +//===----------------------------------------------------------------------===// + +multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX; + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, + EVEX, EVEX_CD8<_.EltSize, CD8VH>; +} + +multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, + avx512vl_f64_info>, XD, VEX_W; +} + +defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>; + +def : Pat<(X86Movddup (loadv2f64 addr:$src)), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; +def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Unpack Instructions +//===----------------------------------------------------------------------===// +defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh>; +defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl>; + +defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; + +defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Extract & Insert Integer Instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayStore = 1 in + def mr : AVX512Ii8<opc, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1), + imm:$src2)))), + addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst), 
+ (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; + } +} + +multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, PD; + + def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; + } +} + +multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _, + RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GRC:$dst, + (extractelt (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + let mayStore = 1 in + def mr : AVX512Ii8<0x16, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (_.VT _.RC:$src1), + imm:$src2),addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD; + } +} + +defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>; +defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>; +defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>; +defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W; + +multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>; + } +} + +multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GRC:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, + EVEX_4V, TAPD; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, + _.ScalarLdFrag>, TAPD; + } +} + +defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info, + extloadi8>, TAPD; +defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, + extloadi16>, PD; +defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; +defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", 
v2i64x_info, GR64>, VEX_W; +//===----------------------------------------------------------------------===// +// VSHUFPS - VSHUFPD Operations +//===----------------------------------------------------------------------===// +multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, + AVX512VLVectorVTInfo VTInfo_FP>{ + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>, + EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, + AVX512AIi8Base, EVEX_4V; +} + +defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS; +defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - Byte shift Left/Right +//===----------------------------------------------------------------------===// + +multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, X86VectorVTInfo _>{ + def rr : AVX512<opc, MRMr, + (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512<opc, MRMm, + (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode + (_.LdFrag addr:$src1), (i8 imm:$src2))))]>; +} + +multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, Predicate prd>{ + let Predicates = [prd] in + defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v8i64_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v4i64x_info>, EVEX_V256; + defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v2i64x_info>, EVEX_V128; + } +} +defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; +defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; + + +multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, + string OpcodeStr, X86VectorVTInfo _dst, + X86VectorVTInfo _src>{ + def rr : AVX512BI<opc, MRMSrcReg, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT _src.RC:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512BI<opc, MRMSrcMem, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT (bitconvert + (_src.LdFrag addr:$src2))))))]>; +} + +multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, + string OpcodeStr, Predicate prd> { + let Predicates = [prd] in + defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info, + v64i8_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info, + v32i8x_info>, EVEX_V256; + defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info, + v16i8x_info>, EVEX_V128; + } +} + +defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", + HasBWI>, EVEX_4V; + +multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + 
let Constraints = "$src1 = $dst" in { + defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT _.RC:$src3), + (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V; + let mayLoad = 1 in { + defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (bitconvert (_.LdFrag addr:$src3))), + (i8 imm:$src4))>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr##", $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (i8 imm:$src4))>, EVEX_B, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + } + }// Constraints = "$src1 = $dst" +} + +multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{ + let Predicates = [HasAVX512] in + defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128; + defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256; + } +} + +defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>; +defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td new file mode 100644 index 0000000..1a2e786 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -0,0 +1,1375 @@ +//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the integer arithmetic instructions in the X86 +// architecture. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LEA - Load Effective Address +let SchedRW = [WriteLEA] in { +let hasSideEffects = 0 in +def LEA16r : I<0x8D, MRMSrcMem, + (outs GR16:$dst), (ins anymem:$src), + "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16; +let isReMaterializable = 1 in +def LEA32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins anymem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea32addr:$src)], IIC_LEA>, + OpSize32, Requires<[Not64BitMode]>; + +def LEA64_32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins lea64_32mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>, + OpSize32, Requires<[In64BitMode]>; + +let isReMaterializable = 1 in +def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src), + "lea{q}\t{$src|$dst}, {$dst|$src}", + [(set GR64:$dst, lea64addr:$src)], IIC_LEA>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Fixed-Register Multiplication and Division Instructions. +// + +// SchedModel info for instruction that loads one value and gets the second +// (and possibly third) value from a register. +// This is used for instructions that put the memory operands before other +// uses. +class SchedLoadReg<SchedWrite SW> : Sched<[SW, + // Memory operand. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Register reads (implicit or explicit). + ReadAfterLd, ReadAfterLd]>; + +// Extra precision multiplication + +// AL is really implied by AX, but the registers in Defs must match the +// SDNode results (i8, i32). +// AL,AH = AL*GR8 +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", + // FIXME: Used for 8-bit mul, ignore result upper 8 bits. + // This probably ought to be moved to a def : Pat<> if the + // syntax can be accepted. + [(set AL, (mul AL, GR8:$src)), + (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>; +// AX,DX = AX*GR16 +let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in +def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src), + "mul{w}\t$src", + [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>; +// EAX,EDX = EAX*GR32 +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in +def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src), + "mul{l}\t$src", + [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/], + IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>; +// RAX,RDX = RAX*GR64 +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in +def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src), + "mul{q}\t$src", + [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/], + IIC_MUL64>, Sched<[WriteIMul]>; +// AL,AH = AL*[mem8] +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src), + "mul{b}\t$src", + // FIXME: Used for 8-bit mul, ignore result upper 8 bits. + // This probably ought to be moved to a def : Pat<> if the + // syntax can be accepted. 
+ [(set AL, (mul AL, (loadi8 addr:$src))), + (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>; +// AX,DX = AX*[mem16] +let mayLoad = 1, hasSideEffects = 0 in { +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src), + "mul{w}\t$src", + [], IIC_MUL16_MEM>, OpSize16, SchedLoadReg<WriteIMulLd>; +// EAX,EDX = EAX*[mem32] +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src), + "mul{l}\t$src", + [], IIC_MUL32_MEM>, OpSize32, SchedLoadReg<WriteIMulLd>; +// RAX,RDX = RAX*[mem64] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src), + "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>; +} + +let hasSideEffects = 0 in { +// AL,AH = AL*GR8 +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [], + IIC_IMUL8>, Sched<[WriteIMul]>; +// AX,DX = AX*GR16 +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [], + IIC_IMUL16_RR>, OpSize16, Sched<[WriteIMul]>; +// EAX,EDX = EAX*GR32 +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [], + IIC_IMUL32_RR>, OpSize32, Sched<[WriteIMul]>; +// RAX,RDX = RAX*GR64 +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [], + IIC_IMUL64_RR>, Sched<[WriteIMul]>; + +let mayLoad = 1 in { +// AL,AH = AL*[mem8] +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src), + "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>; +// AX,DX = AX*[mem16] +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src), + "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize16, + SchedLoadReg<WriteIMulLd>; +// EAX,EDX = EAX*[mem32] +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src), + "imul{l}\t$src", [], IIC_IMUL32_MEM>, OpSize32, + SchedLoadReg<WriteIMulLd>; +// RAX,RDX = RAX*[mem64] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), + "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>; +} +} // hasSideEffects + + +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst" in { + +let isCommutable = 1, SchedRW = [WriteIMul] in { +// X = IMUL Y, Z --> X = IMUL Z, Y +// Register-Register Signed Integer Multiply +def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), + "imul{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>, + TB, OpSize16; +def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), + "imul{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>, + TB, OpSize32; +def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "imul{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>, + TB; +} // isCommutable, SchedRW + +// Register-Memory Signed Integer Multiply +let SchedRW = [WriteIMulLd, ReadAfterLd] in { +def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$src1, i16mem:$src2), + "imul{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, (load addr:$src2)))], + IIC_IMUL16_RM>, + 
TB, OpSize16; +def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i32mem:$src2), + "imul{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, (load addr:$src2)))], + IIC_IMUL32_RM>, + TB, OpSize32; +def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i64mem:$src2), + "imul{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, (load addr:$src2)))], + IIC_IMUL64_RM>, + TB; +} // SchedRW +} // Constraints = "$src1 = $dst" + +} // Defs = [EFLAGS] + +// Surprisingly enough, these are not two address instructions! +let Defs = [EFLAGS] in { +let SchedRW = [WriteIMul] in { +// Register-Integer Signed Integer Multiply +def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 + (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))], + IIC_IMUL16_RRI>, OpSize16; +def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 + (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, i16immSExt8:$src2))], + IIC_IMUL16_RRI>, OpSize16; +def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 + (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, imm:$src2))], + IIC_IMUL32_RRI>, OpSize32; +def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 + (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, i32immSExt8:$src2))], + IIC_IMUL32_RRI>, OpSize32; +def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32 + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt32:$src2))], + IIC_IMUL64_RRI>; +def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 + (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt8:$src2))], + IIC_IMUL64_RRI>; +} // SchedRW + +// Memory-Integer Signed Integer Multiply +let SchedRW = [WriteIMulLd] in { +def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 + (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))], + IIC_IMUL16_RMI>, + OpSize16; +def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 + (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i16immSExt8:$src2))], IIC_IMUL16_RMI>, + OpSize16; +def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 + (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))], + IIC_IMUL32_RMI>, OpSize32; +def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 + (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i32immSExt8:$src2))], + IIC_IMUL32_RMI>, OpSize32; +def IMUL64rmi32 : 
RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32 + (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt32:$src2))], + IIC_IMUL64_RMI>; +def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 + (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt8:$src2))], + IIC_IMUL64_RMI>; +} // SchedRW +} // Defs = [EFLAGS] + + + + +// unsigned division/remainder +let hasSideEffects = 1 in { // so that we don't speculatively execute +let SchedRW = [WriteIDiv] in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH + "div{b}\t$src", [], IIC_DIV8_REG>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX + "div{w}\t$src", [], IIC_DIV16>, OpSize16; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in +def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX + "div{l}\t$src", [], IIC_DIV32>, OpSize32; +// RDX:RAX/r64 = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src), + "div{q}\t$src", [], IIC_DIV64>; +} // SchedRW + +let mayLoad = 1 in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH + "div{b}\t$src", [], IIC_DIV8_MEM>, + SchedLoadReg<WriteIDivLd>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX + "div{w}\t$src", [], IIC_DIV16>, OpSize16, + SchedLoadReg<WriteIDivLd>; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX +def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src), + "div{l}\t$src", [], IIC_DIV32>, + SchedLoadReg<WriteIDivLd>, OpSize32; +// RDX:RAX/[mem64] = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), + "div{q}\t$src", [], IIC_DIV64>, + SchedLoadReg<WriteIDivLd>; +} + +// Signed division/remainder. 
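Like the unsigned forms above, these rely on x86's implicit register convention: the dividend lives in AX, DX:AX, EDX:EAX or RDX:RAX, and a single instruction leaves the quotient in AL/AX/EAX/RAX and the remainder in AH/DX/EDX/RDX, which is why both halves appear in Defs. Division can fault (for example on a zero divisor), hence the enclosing hasSideEffects = 1 that keeps these instructions from being speculated.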
+let SchedRW = [WriteIDiv] in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH + "idiv{b}\t$src", [], IIC_IDIV8>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX + "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in +def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX + "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32; +// RDX:RAX/r64 = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src), + "idiv{q}\t$src", [], IIC_IDIV64>; +} // SchedRW + +let mayLoad = 1 in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH + "idiv{b}\t$src", [], IIC_IDIV8>, + SchedLoadReg<WriteIDivLd>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX + "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16, + SchedLoadReg<WriteIDivLd>; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX +def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), + "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32, + SchedLoadReg<WriteIDivLd>; +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX +def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), + "idiv{q}\t$src", [], IIC_IDIV64>, + SchedLoadReg<WriteIDivLd>; +} +} // hasSideEffects = 0 + +//===----------------------------------------------------------------------===// +// Two address Instructions. +// + +// unary instructions +let CodeSize = 2 in { +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1), + "neg{b}\t$dst", + [(set GR8:$dst, (ineg GR8:$src1)), + (implicit EFLAGS)], IIC_UNARY_REG>; +def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "neg{w}\t$dst", + [(set GR16:$dst, (ineg GR16:$src1)), + (implicit EFLAGS)], IIC_UNARY_REG>, OpSize16; +def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "neg{l}\t$dst", + [(set GR32:$dst, (ineg GR32:$src1)), + (implicit EFLAGS)], IIC_UNARY_REG>, OpSize32; +def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst", + [(set GR64:$dst, (ineg GR64:$src1)), + (implicit EFLAGS)], IIC_UNARY_REG>; +} // Constraints = "$src1 = $dst", SchedRW + +// Read-modify-write negate. +let SchedRW = [WriteALULd, WriteRMW] in { +def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), + "neg{b}\t$dst", + [(store (ineg (loadi8 addr:$dst)), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; +def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), + "neg{w}\t$dst", + [(store (ineg (loadi16 addr:$dst)), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; +def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), + "neg{l}\t$dst", + [(store (ineg (loadi32 addr:$dst)), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; +def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", + [(store (ineg (loadi64 addr:$dst)), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; +} // SchedRW +} // Defs = [EFLAGS] + + +// Note: NOT does not set EFLAGS! + +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +// Match xor -1 to not. Favors these over a move imm + xor to save code size. 
+let AddedComplexity = 15 in { +def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1), + "not{b}\t$dst", + [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>; +def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "not{w}\t$dst", + [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize16; +def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "not{l}\t$dst", + [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>, OpSize32; +def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst", + [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>; +} +} // Constraints = "$src1 = $dst", SchedRW + +let SchedRW = [WriteALULd, WriteRMW] in { +def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), + "not{b}\t$dst", + [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>; +def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), + "not{w}\t$dst", + [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>, + OpSize16; +def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), + "not{l}\t$dst", + [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>, + OpSize32; +def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst", + [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>; +} // SchedRW +} // CodeSize + +// TODO: inc/dec is slow for P4, but fast for Pentium-M. +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +let CodeSize = 2 in +def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "inc{b}\t$dst", + [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))], + IIC_UNARY_REG>; +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. +def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], + IIC_UNARY_REG>, OpSize16; +def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], + IIC_UNARY_REG>, OpSize32; +def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))], + IIC_UNARY_REG>; +} // isConvertibleToThreeAddress = 1, CodeSize = 2 + +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. 
+let CodeSize = 1, hasSideEffects = 0 in { +def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", [], IIC_UNARY_REG>, + OpSize16, Requires<[Not64BitMode]>; +def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", [], IIC_UNARY_REG>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 +} // Constraints = "$src1 = $dst", SchedRW + +let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { + def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst", + [(store (add (loadi8 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; + def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", + [(store (add (loadi16 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; + def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", + [(store (add (loadi32 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; + def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", + [(store (add (loadi64 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; +} // CodeSize = 2, SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +let CodeSize = 2 in +def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "dec{b}\t$dst", + [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))], + IIC_UNARY_REG>; +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. +def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], + IIC_UNARY_REG>, OpSize16; +def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], + IIC_UNARY_REG>, OpSize32; +def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))], + IIC_UNARY_REG>; +} // isConvertibleToThreeAddress = 1, CodeSize = 2 + +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. +let CodeSize = 1, hasSideEffects = 0 in { +def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", [], IIC_UNARY_REG>, + OpSize16, Requires<[Not64BitMode]>; +def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", [], IIC_UNARY_REG>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 +} // Constraints = "$src1 = $dst", SchedRW + + +let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { + def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst", + [(store (add (loadi8 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; + def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", + [(store (add (loadi16 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; + def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", + [(store (add (loadi32 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; + def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + [(store (add (loadi64 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; +} // CodeSize = 2, SchedRW +} // Defs = [EFLAGS] + +/// X86TypeInfo - This is a bunch of information that describes relevant X86 +/// information about value types. For example, it can tell you what the +/// register class and preferred load to use. 
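Concretely, the Xi8/Xi16/Xi32/Xi64 instances defined after this class each bundle the register class, load fragment, memory operand, immediate operand and encoding, and prefix bits for one integer width, so a single multiclass (such as the BinOp* hierarchy below) can stamp out the b/w/l/q variants of an instruction from one description.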
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, + PatFrag loadnode, X86MemOperand memoperand, ImmType immkind, + Operand immoperand, SDPatternOperator immoperator, + Operand imm8operand, SDPatternOperator imm8operator, + bit hasOddOpcode, OperandSize opSize, + bit hasREX_WPrefix> { + /// VT - This is the value type itself. + ValueType VT = vt; + + /// InstrSuffix - This is the suffix used on instructions with this type. For + /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q". + string InstrSuffix = instrsuffix; + + /// RegClass - This is the register class associated with this type. For + /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64. + RegisterClass RegClass = regclass; + + /// LoadNode - This is the load node associated with this type. For + /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64. + PatFrag LoadNode = loadnode; + + /// MemOperand - This is the memory operand associated with this type. For + /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem. + X86MemOperand MemOperand = memoperand; + + /// ImmEncoding - This is the encoding of an immediate of this type. For + /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32 + /// since the immediate fields of i64 instructions is a 32-bit sign extended + /// value. + ImmType ImmEncoding = immkind; + + /// ImmOperand - This is the operand kind of an immediate of this type. For + /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 -> + /// i64i32imm since the immediate fields of i64 instructions is a 32-bit sign + /// extended value. + Operand ImmOperand = immoperand; + + /// ImmOperator - This is the operator that should be used to match an + /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32). + SDPatternOperator ImmOperator = immoperator; + + /// Imm8Operand - This is the operand kind to use for an imm8 of this type. + /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is + /// only used for instructions that have a sign-extended imm8 field form. + Operand Imm8Operand = imm8operand; + + /// Imm8Operator - This is the operator that should be used to match an 8-bit + /// sign extended immediate of this kind in a pattern (e.g. imm16immSExt8). + SDPatternOperator Imm8Operator = imm8operator; + + /// HasOddOpcode - This bit is true if the instruction should have an odd (as + /// opposed to even) opcode. Operations on i8 are usually even, operations on + /// other datatypes are odd. + bit HasOddOpcode = hasOddOpcode; + + /// OpSize - Selects whether the instruction needs a 0x66 prefix based on + /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this + /// to Opsize16. i32 sets this to OpSize32. + OperandSize OpSize = opSize; + + /// HasREX_WPrefix - This bit is set to true if the instruction should have + /// the 0x40 REX prefix. This is set for i64 types. 
+ bit HasREX_WPrefix = hasREX_WPrefix; +} + +def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; + + +def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, + Imm8, i8imm, imm8_su, i8imm, invalid_node, + 0, OpSizeFixed, 0>; +def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, + Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su, + 1, OpSize16, 0>; +def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, + Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su, + 1, OpSize32, 0>; +def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, + Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8, + 1, OpSizeFixed, 1>; + +/// ITy - This instruction base class takes the type info for the instruction. +/// Using this, it: +/// 1. Concatenates together the instruction mnemonic with the appropriate +/// suffix letter, a tab, and the arguments. +/// 2. Infers whether the instruction should have a 0x66 prefix byte. +/// 3. Infers whether the instruction should have a 0x40 REX_W prefix. +/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations) +/// or 1 (for i16,i32,i64 operations). +class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, + string mnemonic, string args, list<dag> pattern, + InstrItinClass itin = IIC_BIN_NONMEM> + : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4}, + opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode }, + f, outs, ins, + !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern, + itin> { + + // Infer instruction prefixes from type info. + let OpSize = typeinfo.OpSize; + let hasREX_WPrefix = typeinfo.HasREX_WPrefix; +} + +// BinOpRR - Instructions like "add reg, reg, reg". +class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + dag outlist, list<dag> pattern, InstrItinClass itin, + Format f = MRMDestReg> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, + Sched<[WriteALU]>; + +// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has +// just a EFLAGS as a result. +class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f = MRMDestReg> + : BinOpRR<opcode, mnemonic, typeinfo, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], + IIC_BIN_NONMEM, f>; + +// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has +// both a regclass and EFLAGS as a result. +class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], + IIC_BIN_NONMEM>; + +// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has +// both a regclass and EFLAGS as a result, and has EFLAGS as input. +class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2, + EFLAGS))], IIC_BIN_CARRY_NONMEM>; + +// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding). 
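A register-register ALU operation has two legal encodings on x86; "addl %ecx, %eax", for instance, can use either the MR-form opcode 0x01 or the RM-form opcode 0x03. The canonical BinOpRR classes above default to the MR form (MRMDestReg), and these *_Rev classes describe only the alternate RM encoding so the disassembler can round-trip it, which is why they set isCodeGenOnly and ForceDisassemble and carry no patterns.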
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + InstrItinClass itin = IIC_BIN_NONMEM> + : ITy<opcode, MRMSrcReg, typeinfo, + (outs typeinfo.RegClass:$dst), + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>, + Sched<[WriteALU]> { + // The disassembler should know about this, but not the asmparser. + let isCodeGenOnly = 1; + let ForceDisassemble = 1; + let hasSideEffects = 0; +} + +// BinOpRR_RDD_Rev - Instructions like "adc reg, reg, reg" (reversed encoding). +class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> + : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>; + +// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding). +class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> + : ITy<opcode, MRMSrcReg, typeinfo, (outs), + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>, + Sched<[WriteALU]> { + // The disassembler should know about this, but not the asmparser. + let isCodeGenOnly = 1; + let ForceDisassemble = 1; + let hasSideEffects = 0; +} + +// BinOpRM - Instructions like "add reg, reg, [mem]". +class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + dag outlist, list<dag> pattern, + InstrItinClass itin = IIC_BIN_MEM> + : ITy<opcode, MRMSrcMem, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, + Sched<[WriteALULd, ReadAfterLd]>; + +// BinOpRM_R - Instructions like "add reg, reg, [mem]". +class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_F - Instructions like "cmp reg, [mem]". +class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_RF - Instructions like "add reg, reg, [mem]". +class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]". +class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2), + EFLAGS))], IIC_BIN_CARRY_MEM>; + +// BinOpRI - Instructions like "add reg, reg, imm". +class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, dag outlist, list<dag> pattern, + InstrItinClass itin = IIC_BIN_NONMEM> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, + Sched<[WriteALU]> { + let ImmT = typeinfo.ImmEncoding; +} + +// BinOpRI_F - Instructions like "cmp reg, imm". 
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; + +// BinOpRI_RF - Instructions like "add reg, reg, imm". +class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; +// BinOpRI_RFF - Instructions like "adc reg, reg, imm". +class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2, + EFLAGS))], IIC_BIN_CARRY_NONMEM>; + +// BinOpRI8 - Instructions like "add reg, reg, imm8". +class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, dag outlist, list<dag> pattern, + InstrItinClass itin = IIC_BIN_NONMEM> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, + Sched<[WriteALU]> { + let ImmT = Imm8; // Always 8-bit immediate. +} + +// BinOpRI8_F - Instructions like "cmp reg, imm8". +class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; + +// BinOpRI8_RF - Instructions like "add reg, reg, imm8". +class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; + +// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8". +class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2, + EFLAGS))], IIC_BIN_CARRY_NONMEM>; + +// BinOpMR - Instructions like "add [mem], reg". +class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM> + : ITy<opcode, MRMDestMem, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>, + Sched<[WriteALULd, WriteRMW]>; + +// BinOpMR_RMW - Instructions like "add [mem], reg". +class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMR_RMW_FF - Instructions like "adc [mem], reg". +class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS), + addr:$dst), + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; + +// BinOpMR_F - Instructions like "cmp [mem], reg". 
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>; + +// BinOpMI - Instructions like "add [mem], imm". +class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern, + InstrItinClass itin = IIC_BIN_MEM> + : ITy<opcode, f, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>, + Sched<[WriteALULd, WriteRMW]> { + let ImmT = typeinfo.ImmEncoding; +} + +// BinOpMI_RMW - Instructions like "add [mem], imm". +class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, + [(store (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src), addr:$dst), + (implicit EFLAGS)]>; +// BinOpMI_RMW_FF - Instructions like "adc [mem], imm". +class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, + [(store (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; + +// BinOpMI_F - Instructions like "cmp [mem], imm". +class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, + [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src))]>; + +// BinOpMI8 - Instructions like "add [mem], imm8". +class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern, + InstrItinClass itin = IIC_BIN_MEM> + : ITy<0x82, f, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>, + Sched<[WriteALULd, WriteRMW]> { + let ImmT = Imm8; // Always 8-bit immediate. +} + +// BinOpMI8_RMW - Instructions like "add [mem], imm8". +class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8". +class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; + +// BinOpMI8_F - Instructions like "cmp [mem], imm8". +class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(set EFLAGS, (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src))]>; + +// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS. +class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands, + InstrItinClass itin = IIC_BIN_NONMEM> + : ITy<opcode, RawFrm, typeinfo, + (outs), (ins typeinfo.ImmOperand:$src), + mnemonic, operands, [], itin>, Sched<[WriteALU]> { + let ImmT = typeinfo.ImmEncoding; + let Uses = [areg]; + let Defs = [areg, EFLAGS]; + let hasSideEffects = 0; +} + +// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define +// and use EFLAGS. 
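The BinOpRI8/BinOpAI split above is about encoding size: an ALU immediate that fits in a sign-extended 8 bits can use the short 0x83-style form, and the accumulator (AL/AX/EAX/RAX) forms drop the ModRM byte entirely. A C++ sketch of that size trade-off (the chooseAddImmForm helper is illustrative, not LLVM code; the byte counts are the standard encodings of a 32-bit "add"). Checking the imm8 form first mirrors the ordering notes in the multiclasses that follow, where the *ri8 defs are listed before the *ri defs.

#include <cstdint>
#include <cstdio>

enum Form { RI8, AI, RI };

static Form chooseAddImmForm(unsigned Reg /*0 = EAX*/, int32_t Imm, unsigned &Len) {
  if (Imm >= -128 && Imm <= 127) { Len = 3; return RI8; } // 83 /0 ib: op+ModRM+imm8
  if (Reg == 0)                  { Len = 5; return AI;  } // 05 id:    op+imm32, EAX only
  Len = 6; return RI;                                     // 81 /0 id: op+ModRM+imm32
}

int main() {
  const char *Names[] = {"add r32, imm8 (0x83)", "add eax, imm32 (0x05)",
                         "add r32, imm32 (0x81)"};
  unsigned Len;
  Form F = chooseAddImmForm(/*EAX*/ 0, 5, Len);
  std::printf("%s -> %u bytes\n", Names[F], Len);   // imm8 form, 3 bytes
  F = chooseAddImmForm(/*EAX*/ 0, 100000, Len);
  std::printf("%s -> %u bytes\n", Names[F], Len);   // accumulator form, 5 bytes
  F = chooseAddImmForm(/*ECX*/ 1, 100000, Len);
  std::printf("%s -> %u bytes\n", Names[F], Len);   // full imm32 form, 6 bytes
  return 0;
}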
+class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, + IIC_BIN_CARRY_NONMEM> { + let Uses = [areg, EFLAGS]; +} + +// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS. +class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> { + let Defs = [EFLAGS]; +} + +/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (...". +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnodeflag, SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; + def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + } // isConvertibleToThreeAddress + } // isCommutable + + def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; + def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; + def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; + def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + + def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>; + + def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. 
+ def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. + let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>; + } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; +} + +/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and +/// SBB. +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, bit CommutableRR, + bit ConvertibleToThreeAddress> { + let Uses = [EFLAGS], Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + } // isConvertibleToThreeAddress + } // isCommutable + + def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; + + def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. 
+ def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. + let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>; + } + } // Uses = [EFLAGS], Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; +} + +/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is +/// defined with "(set EFLAGS, (...". It would be really nice to find a way +/// to factor this with the other ArithBinOp_*. 
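The _RFF classes used here for ADC and SBB take EFLAGS as both an input and an output because the carry produced by a preceding add or sub must flow into them. A portable C++ sketch (illustrative, not LLVM code) of the double-word addition that an add/adc pair implements:

#include <cstdint>
#include <cstdio>

struct U128 { uint64_t Lo, Hi; };

static U128 add128(U128 A, U128 B) {
  U128 R;
  R.Lo = A.Lo + B.Lo;                 // add: sets CF on unsigned overflow
  uint64_t Carry = (R.Lo < A.Lo);     // CF: the result wrapped below an operand
  R.Hi = A.Hi + B.Hi + Carry;         // adc: consumes CF from EFLAGS
  return R;
}

int main() {
  U128 A{~0ull, 0}, B{1, 0};          // (2^64 - 1) + 1 = 2^64
  U128 R = add128(A, B);
  std::printf("lo=%llu hi=%llu\n", (unsigned long long)R.Lo,
              (unsigned long long)R.Hi);   // lo=0 hi=1
  return 0;
}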
+/// +multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + } + } // isCommutable + + def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + + def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + + def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. 
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1 in + def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>; + } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; +} + + +defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m, + X86and_flag, and, 1, 0>; +defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m, + X86or_flag, or, 1, 0>; +defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m, + X86xor_flag, xor, 1, 0>; +defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m, + X86add_flag, add, 1, 1>; +let isCompare = 1 in { +defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, + X86sub_flag, sub, 0, 0>; +} + +// Arithmetic. +defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, + 1, 0>; +defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag, + 0, 0>; + +let isCompare = 1 in { +defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; +} + + +//===----------------------------------------------------------------------===// +// Semantically, test instructions are similar like AND, except they don't +// generate a result. From an encoding perspective, they are very different: +// they don't have all the usual imm8 and REV forms, and are encoded into a +// different space. +def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), + (X86cmp (and_su node:$lhs, node:$rhs), 0)>; + +let isCompare = 1 in { + let Defs = [EFLAGS] in { + let isCommutable = 1 in { + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat>; + def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat>; + def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat>; + } // isCommutable + + def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>; + def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>; + def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>; + + def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; + def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; + def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; + def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; + + def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; + def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>; + def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; + def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; + + // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the + // register class is constrained to GR8_NOREX. This pseudo is explicitly + // marked side-effect free, since it doesn't have an isel pattern like + // other test instructions. 
+ let isPseudo = 1, hasSideEffects = 0 in + def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), + "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; + } // Defs = [EFLAGS] + + def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, + "{$src, %al|al, $src}">; + def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, + "{$src, %ax|ax, $src}">; + def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, + "{$src, %eax|eax, $src}">; + def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, + "{$src, %rax|rax, $src}">; +} // isCompare + +//===----------------------------------------------------------------------===// +// ANDN Instruction +// +multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, + PatFrag ld_frag> { + def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))], + IIC_BIN_NONMEM>, Sched<[WriteALU]>; + def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, + (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>, + Sched<[WriteALULd, ReadAfterLd]>; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W; +} + +let Predicates = [HasBMI] in { + def : Pat<(and (not GR32:$src1), GR32:$src2), + (ANDN32rr GR32:$src1, GR32:$src2)>; + def : Pat<(and (not GR64:$src1), GR64:$src2), + (ANDN64rr GR64:$src1, GR64:$src2)>; + def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)), + (ANDN32rm GR32:$src1, addr:$src2)>; + def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)), + (ANDN64rm GR64:$src1, addr:$src2)>; +} + +//===----------------------------------------------------------------------===// +// MULX Instruction +// +multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + let isCommutable = 1 in + def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>; + + let mayLoad = 1 in + def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>; +} +} + +let Predicates = [HasBMI2] in { + let Uses = [EDX] in + defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>; + let Uses = [RDX] in + defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W; +} + +//===----------------------------------------------------------------------===// +// ADCX Instruction +// +let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], + Constraints = "$src0 = $dst", AddedComplexity = 10 in { + let SchedRW = [WriteALU] in { + def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))], + IIC_BIN_CARRY_NONMEM>, T8PD; + def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))], + IIC_BIN_CARRY_NONMEM>, T8PD; + } // SchedRW + + let mayLoad = 1, SchedRW = [WriteALULd] in { + def ADCX32rm : 
I<0xF6, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))], + IIC_BIN_CARRY_MEM>, T8PD; + + def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))], + IIC_BIN_CARRY_MEM>, T8PD; + } +} + +//===----------------------------------------------------------------------===// +// ADOX Instruction +// +let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS], + Uses = [EFLAGS] in { + let SchedRW = [WriteALU] in { + def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; + + def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; + } // SchedRW + + let mayLoad = 1, SchedRW = [WriteALULd] in { + def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; + + def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; + } +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h new file mode 100644 index 0000000..787f15b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -0,0 +1,183 @@ +//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file exposes functions that may be used with BuildMI from the +// MachineInstrBuilder.h file to handle X86'isms in a clean way. +// +// The BuildMem function may be used with the BuildMI function to add entire +// memory references in a single, typed, function call. X86 memory references +// can be very complex expressions (described in the README), so wrapping them +// up behind an easier to use interface makes sense. Descriptions of the +// functions are included below. +// +// For reference, the order of operands for memory references is: +// (Operand), Base, Scale, Index, Displacement. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H +#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H + +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" + +namespace llvm { + +/// X86AddressMode - This struct holds a generalized full x86 address mode. +/// The base register can be a frame index, which will eventually be replaced +/// with BP or SP and Disp being offsetted accordingly. The displacement may +/// also include the offset of a global value. 
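The helpers in this header all expand a memory reference into the same five operands, Base, Scale, Index, Displacement and a trailing segment-register slot, with the effective address computed as Base + Scale*Index + Disp. A standalone C++ sketch of that convention (the MemRef struct and effectiveAddress helper are illustrative stand-ins, not the LLVM types):

#include <cassert>
#include <cstdint>
#include <cstdio>

struct MemRef {
  uint64_t Base;    // value held by the base register (0 if none)
  unsigned Scale;   // 1, 2, 4 or 8
  uint64_t Index;   // value held by the index register (0 if none)
  int64_t Disp;     // signed displacement
};                  // (the fifth operand, the segment register, is omitted here)

static uint64_t effectiveAddress(const MemRef &M) {
  assert(M.Scale == 1 || M.Scale == 2 || M.Scale == 4 || M.Scale == 8);
  return M.Base + (uint64_t)M.Scale * M.Index + (uint64_t)M.Disp;
}

int main() {
  // Roughly "movq 16(%rdi,%rcx,8), %rax" with %rdi = 0x1000 and %rcx = 3.
  MemRef M = {0x1000, 8, 3, 16};
  std::printf("EA = %#llx\n", (unsigned long long)effectiveAddress(M)); // 0x1028
  return 0;
}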
+struct X86AddressMode { + enum { + RegBase, + FrameIndexBase + } BaseType; + + union { + unsigned Reg; + int FrameIndex; + } Base; + + unsigned Scale; + unsigned IndexReg; + int Disp; + const GlobalValue *GV; + unsigned GVOpFlags; + + X86AddressMode() + : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr), + GVOpFlags(0) { + Base.Reg = 0; + } + + + void getFullAddress(SmallVectorImpl<MachineOperand> &MO) { + assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8); + + if (BaseType == X86AddressMode::RegBase) + MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, + false, false, false, 0, false)); + else { + assert(BaseType == X86AddressMode::FrameIndexBase); + MO.push_back(MachineOperand::CreateFI(Base.FrameIndex)); + } + + MO.push_back(MachineOperand::CreateImm(Scale)); + MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, + false, false, false, 0, false)); + + if (GV) + MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags)); + else + MO.push_back(MachineOperand::CreateImm(Disp)); + + MO.push_back(MachineOperand::CreateReg(0, false, false, + false, false, false, 0, false)); + } +}; + +/// addDirectMem - This function is used to add a direct memory reference to the +/// current instruction -- that is, a dereference of an address in a register, +/// with no scale, index or displacement. An example is: DWORD PTR [EAX]. +/// +static inline const MachineInstrBuilder & +addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) { + // Because memory references are always represented with five + // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction. + return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0); +} + + +static inline const MachineInstrBuilder & +addOffset(const MachineInstrBuilder &MIB, int Offset) { + return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0); +} + +/// addRegOffset - This function is used to add a memory reference of the form +/// [Reg + Offset], i.e., one with no scale or index, but with a +/// displacement. An example is: DWORD PTR [EAX + 4]. +/// +static inline const MachineInstrBuilder & +addRegOffset(const MachineInstrBuilder &MIB, + unsigned Reg, bool isKill, int Offset) { + return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset); +} + +/// addRegReg - This function is used to add a memory reference of the form: +/// [Reg + Reg]. +static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB, + unsigned Reg1, bool isKill1, + unsigned Reg2, bool isKill2) { + return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1) + .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0); +} + +static inline const MachineInstrBuilder & +addFullAddress(const MachineInstrBuilder &MIB, + const X86AddressMode &AM) { + assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8); + + if (AM.BaseType == X86AddressMode::RegBase) + MIB.addReg(AM.Base.Reg); + else { + assert(AM.BaseType == X86AddressMode::FrameIndexBase); + MIB.addFrameIndex(AM.Base.FrameIndex); + } + + MIB.addImm(AM.Scale).addReg(AM.IndexReg); + if (AM.GV) + MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags); + else + MIB.addImm(AM.Disp); + + return MIB.addReg(0); +} + +/// addFrameReference - This function is used to add a reference to the base of +/// an abstract object on the stack frame of the current function. This +/// reference has base register as the FrameIndex offset until it is resolved. +/// This allows a constant offset to be specified as well... 
+/// +static inline const MachineInstrBuilder & +addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { + MachineInstr *MI = MIB; + MachineFunction &MF = *MI->getParent()->getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + const MCInstrDesc &MCID = MI->getDesc(); + unsigned Flags = 0; + if (MCID.mayLoad()) + Flags |= MachineMemOperand::MOLoad; + if (MCID.mayStore()) + Flags |= MachineMemOperand::MOStore; + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + return addOffset(MIB.addFrameIndex(FI), Offset) + .addMemOperand(MMO); +} + +/// addConstantPoolReference - This function is used to add a reference to the +/// base of a constant value spilled to the per-function constant pool. The +/// reference uses the abstract ConstantPoolIndex which is retained until +/// either machine code emission or assembly output. In PIC mode on x86-32, +/// the GlobalBaseReg parameter can be used to make this a +/// GlobalBaseReg-relative reference. +/// +static inline const MachineInstrBuilder & +addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI, + unsigned GlobalBaseReg, unsigned char OpFlags) { + //FIXME: factor this + return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0) + .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0); +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td new file mode 100644 index 0000000..c73c950 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -0,0 +1,112 @@ +//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 conditional move and set on condition +// instructions. +// +//===----------------------------------------------------------------------===// + + +// CMOV instructions. 
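X86cmov is a plain select over EFLAGS, and the "$src1 = $dst" constraint in the multiclass below means the false value already occupies the destination and is conditionally overwritten. A minimal C++ sketch of that shape (illustrative only; smax is an assumed example, not LLVM code):

#include <cstdint>
#include <cstdio>

static int64_t smax(int64_t A, int64_t B) {
  int64_t Dst = A;        // $src1 = $dst: destination starts as the false value
  if (B > A)              // cmp: the comparison only produces flags
    Dst = B;              // cmovg-like conditional overwrite of the destination
  return Dst;
}

int main() {
  std::printf("%lld\n", (long long)smax(-3, 7));   // 7
  return 0;
}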
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + isCommutable = 1, SchedRW = [WriteALU] in { + def NAME#16rr + : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, + (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))], + IIC_CMOV16_RR>, TB, OpSize16; + def NAME#32rr + : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, + (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))], + IIC_CMOV32_RR>, TB, OpSize32; + def NAME#64rr + :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, + (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))], + IIC_CMOV32_RR>, TB; + } + + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + SchedRW = [WriteALULd, ReadAfterLd] in { + def NAME#16rm + : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), + CondNode, EFLAGS))], IIC_CMOV16_RM>, + TB, OpSize16; + def NAME#32rm + : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), + CondNode, EFLAGS))], IIC_CMOV32_RM>, + TB, OpSize32; + def NAME#64rm + :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + CondNode, EFLAGS))], IIC_CMOV32_RM>, TB; + } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" +} // end multiclass + + +// Conditional Moves. +defm CMOVO : CMOV<0x40, "cmovo" , X86_COND_O>; +defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>; +defm CMOVB : CMOV<0x42, "cmovb" , X86_COND_B>; +defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>; +defm CMOVE : CMOV<0x44, "cmove" , X86_COND_E>; +defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>; +defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>; +defm CMOVA : CMOV<0x47, "cmova" , X86_COND_A>; +defm CMOVS : CMOV<0x48, "cmovs" , X86_COND_S>; +defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>; +defm CMOVP : CMOV<0x4A, "cmovp" , X86_COND_P>; +defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>; +defm CMOVL : CMOV<0x4C, "cmovl" , X86_COND_L>; +defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>; +defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>; +defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>; + + +// SetCC instructions. 
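The SETcc (and CMOVcc) condition codes below are fixed predicates over EFLAGS. A C++ sketch of those predicates, evaluated from the individual flag bits (the Flags struct and eval helper are illustrative, not LLVM API):

#include <cstdio>

struct Flags { bool CF, ZF, SF, OF, PF; };

enum Cond { O, NO, B, AE, E, NE, BE, A, S, NS, P, NP, L, GE, LE, G };

static bool eval(Cond C, Flags F) {
  switch (C) {
  case O:  return F.OF;                 case NO: return !F.OF;
  case B:  return F.CF;                 case AE: return !F.CF;
  case E:  return F.ZF;                 case NE: return !F.ZF;
  case BE: return F.CF || F.ZF;         case A:  return !F.CF && !F.ZF;
  case S:  return F.SF;                 case NS: return !F.SF;
  case P:  return F.PF;                 case NP: return !F.PF;
  case L:  return F.SF != F.OF;         case GE: return F.SF == F.OF;
  case LE: return F.ZF || F.SF != F.OF; case G:  return !F.ZF && F.SF == F.OF;
  }
  return false;
}

int main() {
  // Flags as produced by "cmp $7, %eax" with %eax = 3: the subtraction 3 - 7
  // borrows and is negative, so CF=1, ZF=0, SF=1, OF=0, PF=1 (0xFC has an
  // even number of set bits).
  Flags F{true, false, true, false, true};
  std::printf("setb -> %d, setl -> %d, setge -> %d\n",
              eval(B, F), eval(L, F), eval(GE, F));   // 1, 1, 0
  return 0;
}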
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> { + let Uses = [EFLAGS] in { + def r : I<opc, MRMXr, (outs GR8:$dst), (ins), + !strconcat(Mnemonic, "\t$dst"), + [(set GR8:$dst, (X86setcc OpNode, EFLAGS))], + IIC_SET_R>, TB, Sched<[WriteALU]>; + def m : I<opc, MRMXm, (outs), (ins i8mem:$dst), + !strconcat(Mnemonic, "\t$dst"), + [(store (X86setcc OpNode, EFLAGS), addr:$dst)], + IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>; + } // Uses = [EFLAGS] +} + +defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set +defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set +defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than +defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal +defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to +defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to +defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal +defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than +defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set +defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed +defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set +defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set +defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than +defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal +defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal +defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than + diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td new file mode 100644 index 0000000..96a29ca --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -0,0 +1,1864 @@ +//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the various pseudo instructions used by the compiler, +// as well as Pat patterns used during instruction selection. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pattern Matching Support + +def GetLo32XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 32 bits. + return getI32Imm((unsigned)N->getZExtValue(), SDLoc(N)); +}]>; + +def GetLo8XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 8 bits. + return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); +}]>; + + +//===----------------------------------------------------------------------===// +// Random Pseudo Instructions. + +// PIC base construction. This expands to code that looks like this: +// call $next_inst +// popl %destreg" +let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in + def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), + "", []>; + + +// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. 
+let Defs = [ESP, EFLAGS], Uses = [ESP] in { +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKDOWN", + []>, + Requires<[NotLP64]>; +def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[NotLP64]>; +} +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>; + + +// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. +let Defs = [RSP, EFLAGS], Uses = [RSP] in { +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKDOWN", + []>, + Requires<[IsLP64]>; +def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[IsLP64]>; +} +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>; + + +// x86-64 va_start lowering magic. +let usesCustomInserter = 1, Defs = [EFLAGS] in { +def VASTART_SAVE_XMM_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset", + [(X86vastart_save_xmm_regs GR8:$al, + imm:$regsavefi, + imm:$offset), + (implicit EFLAGS)]>; + +// The VAARG_64 pseudo-instruction takes the address of the va_list, +// and places the address of the next argument into a register. +let Defs = [EFLAGS] in +def VAARG_64 : I<0, Pseudo, + (outs GR64:$dst), + (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), + "#VAARG_64 $dst, $ap, $size, $mode, $align", + [(set GR64:$dst, + (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)), + (implicit EFLAGS)]>; + +// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows +// targets. These calls are needed to probe the stack when allocating more than +// 4k bytes in one go. Touching the stack at 4K increments is necessary to +// ensure that the guard pages used by the OS virtual memory manager are +// allocated in correct sequence. +// The main point of having separate instruction are extra unmodelled effects +// (compared to ordinary calls) like stack pointer change. + +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in + def WIN_ALLOCA : I<0, Pseudo, (outs), (ins), + "# dynamic stack allocation", + [(X86WinAlloca)]>; + +// When using segmented stacks these are lowered into instructions which first +// check if the current stacklet has enough free memory. If it does, memory is +// allocated by bumping the stack pointer. Otherwise memory is allocated from +// the heap. 
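The 4K-increment probing mentioned above exists because the OS grows the stack one guard page at a time; touching every page keeps the guard moving ahead of the allocation instead of jumping past it. A conceptual C++ sketch only (probeRange is an assumed helper, not the real _chkstk, and a heap buffer stands in for the stack):

#include <cstddef>
#include <cstdio>
#include <vector>

static const std::size_t PageSize = 4096;

static unsigned probeRange(unsigned char *Base, std::size_t Size) {
  unsigned Touches = 0;
  std::size_t Off = Size;
  while (Off > PageSize) {    // walk downward, one touch per 4K page, like the
    Off -= PageSize;          // probe loop emitted for allocations over a page
    Base[Off] = 0;
    ++Touches;
  }
  Base[0] = 0;                // final (possibly partial) page
  return Touches + 1;
}

int main() {
  std::size_t AllocSize = 5 * PageSize + 123;       // > 4K, so probing is needed
  std::vector<unsigned char> FakeStack(AllocSize);
  unsigned N = probeRange(FakeStack.data(), FakeStack.size());
  std::printf("%u probe touches for %zu bytes\n", N, AllocSize);   // 6 touches
  return 0;
}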
+ +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in +def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), + "# variable sized alloca for segmented stacks", + [(set GR32:$dst, + (X86SegAlloca GR32:$size))]>, + Requires<[NotLP64]>; + +let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in +def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), + "# variable sized alloca for segmented stacks", + [(set GR64:$dst, + (X86SegAlloca GR64:$size))]>, + Requires<[In64BitMode]>; +} + +//===----------------------------------------------------------------------===// +// EH Pseudo Instructions +// +let SchedRW = [WriteSystem] in { +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; + +} + +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; + +} + +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isCodeGenOnly = 1, isReturn = 1 in { + def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>; + + // CATCHRET needs a custom inserter for SEH. + let usesCustomInserter = 1 in + def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from), + "# CATCHRET", + [(catchret bb:$dst, bb:$from)]>; +} + +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in +def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>; + +// This instruction is responsible for re-establishing stack pointers after an +// exception has been caught and we are rejoining normal control flow in the +// parent function or funclet. It generally sets ESP and EBP, and optionally +// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us +// elsewhere. +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in +def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>; + +let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in { + def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), + "#EH_SJLJ_SETJMP32", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[Not64BitMode]>; + def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf), + "#EH_SJLJ_SETJMP64", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[In64BitMode]>; + let isTerminator = 1 in { + def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf), + "#EH_SJLJ_LONGJMP32", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[Not64BitMode]>; + def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf), + "#EH_SJLJ_LONGJMP64", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[In64BitMode]>; + } +} +} // SchedRW + +let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { + def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), + "#EH_SjLj_Setup\t$dst", []>; +} + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by unwind info. 
+// +let isPseudo = 1 in { + def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), + "#SEH_PushReg $reg", []>; + def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveReg $reg, $dst", []>; + def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveXMM $reg, $dst", []>; + def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size), + "#SEH_StackAlloc $size", []>; + def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset), + "#SEH_SetFrame $reg, $offset", []>; + def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode), + "#SEH_PushFrame $mode", []>; + def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), + "#SEH_EndPrologue", []>; + def SEH_Epilogue : I<0, Pseudo, (outs), (ins), + "#SEH_Epilogue", []>; +} + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by segmented stacks. +// + +// This is lowered into a RET instruction by MCInstLower. We need +// this so that we don't have to have a MachineBasicBlock which ends +// with a RET and also has successors. +let isPseudo = 1 in { +def MORESTACK_RET: I<0, Pseudo, (outs), (ins), + "", []>; + +// This instruction is lowered to a RET followed by a MOV. The two +// instructions are not generated on a higher level since then the +// verifier sees a MachineBasicBlock ending with a non-terminator. +def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), + "", []>; +} + +//===----------------------------------------------------------------------===// +// Alias Instructions +//===----------------------------------------------------------------------===// + +// Alias instruction mapping movr0 to xor. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1 in +def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; + +// Other widths can also make use of the 32-bit xor, which may have a smaller +// encoding and avoid partial register updates. +def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; +def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; +def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { + let AddedComplexity = 20; +} + +let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], + AddedComplexity = 1 in { + // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, + // which only require 3 bytes compared to MOV32ri which requires 5. + let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { + def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 1)]>; + def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, -1)]>; + } + + // MOV16ri is 4 bytes, so the instructions above are smaller. + def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>; + def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; +} + +// Materialize i64 constant where top 32-bits are zero. This could theoretically +// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however +// that would make it more difficult to rematerialize. +let isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1, hasSideEffects = 0 in +def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>; + +// This 64-bit pseudo-move can be used for both a 64-bit constant that is +// actually the zero-extension of a 32-bit constant and for labels in the +// x86-64 small code model. 
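The MOV32r0/MOV32r1/MOV32r_1 pseudos and the MOV32ri64 pattern above all trade on encoding size plus the fact that writing a 32-bit register zero-extends to 64 bits. A C++ sketch of that trade-off (bytesToMaterialize64 is an illustrative helper, not LLVM code; the byte counts are standard x86 encodings, and the xor forms additionally clobber EFLAGS, which is why MOV32r0 lists Defs = [EFLAGS]):

#include <cstdint>
#include <cstdio>

static unsigned bytesToMaterialize64(int64_t V, bool In64BitMode) {
  if (V == 0)
    return 2;                                   // xor r32, r32
  if ((V == 1 || V == -1) && !In64BitMode)
    return 3;                                   // xor r32, r32 ; inc/dec r32
  if ((uint64_t)V <= 0xFFFFFFFFull)
    return 5;                                   // mov r32, imm32 (zero-extends)
  if (V >= INT32_MIN && V <= INT32_MAX)
    return 7;                                   // REX.W mov r64, simm32 (sign-extends)
  return 10;                                    // movabs r64, imm64
}

int main() {
  std::printf("0      -> %u bytes\n", bytesToMaterialize64(0, true));           // 2
  std::printf("1      -> %u bytes\n", bytesToMaterialize64(1, false));          // 3
  std::printf("2^32-1 -> %u bytes\n", bytesToMaterialize64(0xFFFFFFFF, true));  // 5
  std::printf("-4     -> %u bytes\n", bytesToMaterialize64(-4, true));          // 7
  std::printf("2^40   -> %u bytes\n", bytesToMaterialize64(1ll << 40, true));   // 10
  return 0;
}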
+def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>; + +let AddedComplexity = 1 in +def : Pat<(i64 mov64imm32:$src), + (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>; + +// Use sbb to materialize carry bit. +let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in { +// FIXME: These are pseudo ops that should be replaced with Pat<> patterns. +// However, Pat<> can't replicate the destination reg into the inputs of the +// result. +def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "", + [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "", + [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", + [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +} // isCodeGenOnly + + +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C64r)>; + +def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C64r)>; + +// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and +// will be eliminated and that the sbb can be extended up to a wider type. When +// this happens, it is great. However, if we are left with an 8-bit sbb and an +// and, we might as well just match it as a setb. +def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), + (SETBr)>; + +// (add OP, SETB) -> (adc OP, 0) +def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), + (ADC8ri GR8:$op, 0)>; +def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), + (ADC64ri8 GR64:$op, 0)>; + +// (sub OP, SETB) -> (sbb OP, 0) +def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB64ri8 GR64:$op, 0)>; + +// (sub OP, SETCC_CARRY) -> (adc OP, 0) +def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC64ri8 GR64:$op, 0)>; + +//===----------------------------------------------------------------------===// +// String Pseudo Instructions +// +let SchedRW = [WriteMicrocoded] in { +let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { +def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[Not64BitMode]>; +def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16, + Requires<[Not64BitMode]>; +def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], 
isCodeGenOnly = 1 in { +def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16, + Requires<[In64BitMode]>; +def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, + Requires<[In64BitMode]>; +def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", + [(X86rep_movs i64)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +} + +// FIXME: Should use "(X86rep_stos AL)" as the pattern. +let Defs = [ECX,EDI], isCodeGenOnly = 1 in { + let Uses = [AL,ECX,EDI] in + def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[Not64BitMode]>; + let Uses = [AX,ECX,EDI] in + def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, + Requires<[Not64BitMode]>; + let Uses = [EAX,ECX,EDI] in + def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [RCX,RDI], isCodeGenOnly = 1 in { + let Uses = [AL,RCX,RDI] in + def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; + let Uses = [AX,RCX,RDI] in + def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, + Requires<[In64BitMode]>; + let Uses = [RAX,RCX,RDI] in + def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, + Requires<[In64BitMode]>; + + let Uses = [RAX,RCX,RDI] in + def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", + [(X86rep_stos i64)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Thread Local Storage Instructions +// + +// ELF TLS Support +// All calls clobber the non-callee saved registers. ESP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP] in { +def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_addr32", + [(X86tlsaddr tls32addr:$sym)]>, + Requires<[Not64BitMode]>; +def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_base_addr32", + [(X86tlsbaseaddr tls32baseaddr:$sym)]>, + Requires<[Not64BitMode]>; +} + +// All calls clobber the non-callee saved registers. RSP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. 
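+// For reference, on x86-64 ELF the general-dynamic access that TLS_addr64
+// stands for is ultimately a call sequence roughly of the form
+//   leaq sym@tlsgd(%rip), %rdi
+//   call __tls_get_addr@PLT
+// i.e. an ordinary call, which is why the full caller-saved GPR, x87, MMX and
+// XMM register set is listed as clobbered below.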
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [RSP] in { +def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_addr64", + [(X86tlsaddr tls64addr:$sym)]>, + Requires<[In64BitMode]>; +def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_base_addr64", + [(X86tlsbaseaddr tls64baseaddr:$sym)]>, + Requires<[In64BitMode]>; +} + +// Darwin TLS Support +// For i386, the address of the thunk is passed on the stack, on return the +// address of the variable is in %eax. %ecx is trashed during the function +// call. All other registers are preserved. +let Defs = [EAX, ECX, EFLAGS], + Uses = [ESP], + usesCustomInserter = 1 in +def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLSCall_32", + [(X86TLSCall addr:$sym)]>, + Requires<[Not64BitMode]>; + +// For x86_64, the address of the thunk is passed in %rdi, on return +// the address of the variable is in %rax. All other registers are preserved. +let Defs = [RAX, EFLAGS], + Uses = [RSP, RDI], + usesCustomInserter = 1 in +def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLSCall_64", + [(X86TLSCall addr:$sym)]>, + Requires<[In64BitMode]>; + + +//===----------------------------------------------------------------------===// +// Conditional Move Pseudo Instructions + +// CMOV* - Used to implement the SELECT DAG operation. Expanded after +// instruction selection into a branch sequence. +multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> { + def CMOV#NAME : I<0, Pseudo, + (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond), + "#CMOV_"#NAME#" PSEUDO!", + [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond, + EFLAGS)))]>; +} + +let usesCustomInserter = 1, Uses = [EFLAGS] in { + // X86 doesn't have 8-bit conditional moves. Use a customInserter to + // emit control flow. An alternative to this is to mark i8 SELECT as Promote, + // however that requires promoting the operands, and can induce additional + // i8 register pressure. + defm _GR8 : CMOVrr_PSEUDO<GR8, i8>; + + let Predicates = [NoCMov] in { + defm _GR32 : CMOVrr_PSEUDO<GR32, i32>; + defm _GR16 : CMOVrr_PSEUDO<GR16, i16>; + } // Predicates = [NoCMov] + + // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no + // SSE1/SSE2. 
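+  // Rough sketch of what the custom inserter produces for each CMOV_* pseudo:
+  // a conditional branch around a copy, merged by a PHI, conceptually
+  //   jCC  .Lsink        # condition holds: keep $t
+  //   ...                # fall-through block supplies $f
+  // .Lsink:
+  //   $dst = phi($t, $f)
+  // which is why these classes need usesCustomInserter instead of a Pat<>.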
+ let Predicates = [FPStackf32] in + defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>; + + let Predicates = [FPStackf64] in + defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>; + + defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>; + + defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; + defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + defm _FR128 : CMOVrr_PSEUDO<FR128, f128>; + defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>; + defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>; + defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>; + defm _V8F32 : CMOVrr_PSEUDO<VR256, v8f32>; + defm _V4F64 : CMOVrr_PSEUDO<VR256, v4f64>; + defm _V4I64 : CMOVrr_PSEUDO<VR256, v4i64>; + defm _V8I64 : CMOVrr_PSEUDO<VR512, v8i64>; + defm _V8F64 : CMOVrr_PSEUDO<VR512, v8f64>; + defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>; + defm _V8I1 : CMOVrr_PSEUDO<VK8, v8i1>; + defm _V16I1 : CMOVrr_PSEUDO<VK16, v16i1>; + defm _V32I1 : CMOVrr_PSEUDO<VK32, v32i1>; + defm _V64I1 : CMOVrr_PSEUDO<VK64, v64i1>; +} // usesCustomInserter = 1, Uses = [EFLAGS] + +//===----------------------------------------------------------------------===// +// Normal-Instructions-With-Lock-Prefix Pseudo Instructions +//===----------------------------------------------------------------------===// + +// FIXME: Use normal instructions and add lock prefix dynamically. + +// Memory barriers + +// TODO: Get this to fold the constant into the instruction. +let isCodeGenOnly = 1, Defs = [EFLAGS] in +def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), + "or{l}\t{$zero, $dst|$dst, $zero}", [], + IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK, + Sched<[WriteALULd, WriteRMW]>; + +let hasSideEffects = 1 in +def Int_MemBarrier : I<0, Pseudo, (outs), (ins), + "#MEMBARRIER", + [(X86MemBarrier)]>, Sched<[WriteLoad]>; + +// RegOpc corresponds to the mr version of the instruction +// ImmOpc corresponds to the mi version of the instruction +// ImmOpc8 corresponds to the mi8 version of the instruction +// ImmMod corresponds to the instruction format of the mi and mi8 versions +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, + Format ImmMod, string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + +def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; +def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, OpSize16, LOCK; +def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, OpSize32, LOCK; +def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; + +def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + 
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize16, LOCK; + +def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize32, LOCK; + +def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize16, LOCK; +def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize32, LOCK; +def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +} + +} + +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">; +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">; +defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">; +defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">; +defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">; + +// Optimized codegen when the non-memory output is not used. +multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + +def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst), + !strconcat(mnemonic, "{b}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst), + !strconcat(mnemonic, "{w}\t$dst"), + [], IIC_UNARY_MEM>, OpSize16, LOCK; +def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst), + !strconcat(mnemonic, "{l}\t$dst"), + [], IIC_UNARY_MEM>, OpSize32, LOCK; +def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst), + !strconcat(mnemonic, "{q}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +} +} + +defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">; +defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">; + +// Atomic compare and swap. 
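+// Reminder of the underlying semantics: "lock cmpxchg{b,w,l,q} $swap, (mem)"
+// compares the accumulator (AL/AX/EAX/RAX) with (mem); if they are equal it
+// stores $swap, otherwise it loads (mem) into the accumulator, with ZF
+// reporting which case happened. That is why the defs/uses below pin the
+// accumulator register for every width.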
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic, + SDPatternOperator frag, X86MemOperand x86memop, + InstrItinClass itin> { +let isCodeGenOnly = 1 in { + def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr), + !strconcat(mnemonic, "\t$ptr"), + [(frag addr:$ptr)], itin>, TB, LOCK; +} +} + +multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic, SDPatternOperator frag, + InstrItinClass itin8, InstrItinClass itin> { +let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in { + let Defs = [AL, EFLAGS], Uses = [AL] in + def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap), + !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK; + let Defs = [AX, EFLAGS], Uses = [AX] in + def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap), + !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK; + let Defs = [EAX, EFLAGS], Uses = [EAX] in + def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap), + !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK; + let Defs = [RAX, EFLAGS], Uses = [RAX] in + def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap), + !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK; +} +} + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], + SchedRW = [WriteALULd, WriteRMW] in { +defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", + X86cas8, i64mem, + IIC_CMPX_LOCK_8B>; +} + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], + Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in { +defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", + X86cas16, i128mem, + IIC_CMPX_LOCK_16B>, REX_W; +} + +defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", + X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>; + +// Atomic exchange and add +multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic, + string frag, + InstrItinClass itin8, InstrItinClass itin> { + let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], + itin8>; + def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], + itin>, OpSize16; + def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], + itin>, OpSize32; + def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], + itin>; + } +} + +defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", + IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>, + TB, LOCK; + +/* The following multiclass tries to make sure that in code like + * x.store (immediate op x.load(acquire), release) + * and + * x.store (register op x.load(acquire), release) + * an operation directly on 
memory is generated instead of wasting a register. + * It is not automatic as atomic_store/load are only lowered to MOV instructions + * extremely late to prevent them from being accidentally reordered in the backend + * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) + */ +multiclass RELEASE_BINOP_MI<SDNode op> { + def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#BINOP "#NAME#"8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (op + (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src), + "#BINOP "#NAME#"8mr PSEUDO!", + [(atomic_store_8 addr:$dst, (op + (atomic_load_8 addr:$dst), GR8:$src))]>; + // NAME#16 is not generated as 16-bit arithmetic instructions are considered + // costly and avoided as far as possible by this backend anyway + def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#BINOP "#NAME#"32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (op + (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, (op + (atomic_load_32 addr:$dst), GR32:$src))]>; + def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#BINOP "#NAME#"64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, (op + (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, (op + (atomic_load_64 addr:$dst), GR64:$src))]>; +} +let Defs = [EFLAGS] in { + defm RELEASE_ADD : RELEASE_BINOP_MI<add>; + defm RELEASE_AND : RELEASE_BINOP_MI<and>; + defm RELEASE_OR : RELEASE_BINOP_MI<or>; + defm RELEASE_XOR : RELEASE_BINOP_MI<xor>; + // Note: we don't deal with sub, because substractions of constants are + // optimized into additions before this code can run. +} + +// Same as above, but for floating-point. +// FIXME: imm version. +// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// FIXME: This could also handle SIMD operations with *ps and *pd instructions. +let usesCustomInserter = 1 in { +multiclass RELEASE_FP_BINOP_MI<SDNode op> { + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, + (i32 (bitconvert (op + (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), + FR32:$src))))]>, Requires<[HasSSE1]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, + (i64 (bitconvert (op + (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), + FR64:$src))))]>, Requires<[HasSSE2]>; +} +defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>; +// FIXME: Add fsub, fmul, fdiv, ... 
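+// Concrete example of the FP pattern above: for "std::atomic<float> x;
+// x.store(x.load(memory_order_acquire) + y, memory_order_release)" the 32-bit
+// atomic load and store stay ordinary integer moves while the add itself is
+// done in an SSE register; the bitconverts in the pattern exist only to move
+// the bits between the integer (atomic_load/store) and FP (fadd) types.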
+} + +multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { + def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), + "#UNOP "#NAME#"8m PSEUDO!", + [(atomic_store_8 addr:$dst, dag8)]>; + def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), + "#UNOP "#NAME#"16m PSEUDO!", + [(atomic_store_16 addr:$dst, dag16)]>; + def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), + "#UNOP "#NAME#"32m PSEUDO!", + [(atomic_store_32 addr:$dst, dag32)]>; + def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), + "#UNOP "#NAME#"64m PSEUDO!", + [(atomic_store_64 addr:$dst, dag64)]>; +} + +let Defs = [EFLAGS] in { + defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; + defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +} +/* +TODO: These don't work because the type inference of TableGen fails. +TODO: find a way to fix it. +let Defs = [EFLAGS] in { + defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +} +// NOT doesn't set flags. +defm RELEASE_NOT : RELEASE_UNOP< + (not (atomic_load_8 addr:$dst)), + (not (atomic_load_16 addr:$dst)), + (not (atomic_load_32 addr:$dst)), + (not (atomic_load_64 addr:$dst))>; +*/ + +def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_MOV8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; +def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), + "#RELEASE_MOV16mi PSEUDO!", + [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; +def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_MOV32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; +def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_MOV64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; + +def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), + "#RELEASE_MOV8mr PSEUDO!", + [(atomic_store_8 addr:$dst, GR8 :$src)]>; +def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), + "#RELEASE_MOV16mr PSEUDO!", + [(atomic_store_16 addr:$dst, GR16:$src)]>; +def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#RELEASE_MOV32mr PSEUDO!", + [(atomic_store_32 addr:$dst, GR32:$src)]>; +def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#RELEASE_MOV64mr PSEUDO!", + [(atomic_store_64 addr:$dst, GR64:$src)]>; + +def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), + "#ACQUIRE_MOV8rm PSEUDO!", + [(set GR8:$dst, (atomic_load_8 addr:$src))]>; +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), + "#ACQUIRE_MOV16rm PSEUDO!", + [(set GR16:$dst, (atomic_load_16 addr:$src))]>; +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), + "#ACQUIRE_MOV32rm PSEUDO!", + [(set GR32:$dst, (atomic_load_32 addr:$src))]>; +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), + "#ACQUIRE_MOV64rm PSEUDO!", + [(set GR64:$dst, (atomic_load_64 addr:$src))]>; + +//===----------------------------------------------------------------------===// +// 
DAG Pattern Matching Rules +//===----------------------------------------------------------------------===// + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable +def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>; +def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>; +def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>; +def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>; +def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; +def : Pat<(i32 (X86Wrapper mcsym:$dst)), (MOV32ri mcsym:$dst)>; +def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>; + +def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)), + (ADD32ri GR32:$src1, tconstpool:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)), + (ADD32ri GR32:$src1, tjumptable:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)), + (ADD32ri GR32:$src1, tglobaladdr:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)), + (ADD32ri GR32:$src1, texternalsym:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper mcsym:$src2)), + (ADD32ri GR32:$src1, mcsym:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)), + (ADD32ri GR32:$src1, tblockaddress:$src2)>; + +def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV32mi addr:$dst, tglobaladdr:$src)>; +def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV32mi addr:$dst, texternalsym:$src)>; +def : Pat<(store (i32 (X86Wrapper mcsym:$src)), addr:$dst), + (MOV32mi addr:$dst, mcsym:$src)>; +def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV32mi addr:$dst, tblockaddress:$src)>; + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small +// code model mode, should use 'movabs'. FIXME: This is really a hack, the +// 'movabs' predicate should handle this sort of thing. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri tconstpool :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri tjumptable :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri texternalsym:$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper mcsym:$dst)), + (MOV64ri mcsym:$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>; + +// In kernel code model, we can get the address of a label +// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of +// the MOV64ri32 should accept these. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper mcsym:$dst)), + (MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>; + +// If we have small model and -static mode, it is safe to store global addresses +// directly as immediates. 
FIXME: This is really a hack, the 'imm' predicate +// for MOV64mi32 should handle this sort of thing. +def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tconstpool:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tjumptable:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tglobaladdr:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, texternalsym:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, mcsym:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tblockaddress:$src)>, + Requires<[NearData, IsStatic]>; + +def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>; +def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>; + +// Calls + +// tls has some funny stuff here... +// This corresponds to movabs $foo@tpoff, %rax +def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), + (MOV64ri32 tglobaltlsaddr :$dst)>; +// This corresponds to add $foo@tpoff, %rax +def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), + (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; + + +// Direct PC relative function call for small code model. 32-bit displacement +// sign extended to 64-bit. +def : Pat<(X86call (i64 tglobaladdr:$dst)), + (CALL64pcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i64 texternalsym:$dst)), + (CALL64pcrel32 texternalsym:$dst)>; + +// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they +// can never use callee-saved registers. That is the purpose of the GR64_TC +// register classes. +// +// The only volatile register that is never used by the calling convention is +// %r11. This happens when calling a vararg function with 6 arguments. +// +// Match an X86tcret that uses less than 7 volatile registers. +def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), + (X86tcret node:$ptr, node:$off), [{ + // X86tcret args: (*chain, ptr, imm, regs..., glue) + unsigned NumRegs = 0; + for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i) + if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6) + return false; + return true; +}]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode]>; + +// FIXME: This is disabled for 32-bit PIC mode because the global base +// register which is part of the address mode may be assigned a +// callee-saved register. +def : Pat<(X86tcret (load addr:$dst), imm:$off), + (TCRETURNmi addr:$dst, imm:$off)>, + Requires<[Not64BitMode, IsNotPIC]>; + +def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), + (TCRETURNdi tglobaladdr:$dst, imm:$off)>, + Requires<[NotLP64]>; + +def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), + (TCRETURNdi texternalsym:$dst, imm:$off)>, + Requires<[NotLP64]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +// Don't fold loads into X86tcret requiring more than 6 regs. +// There wouldn't be enough scratch registers for base+index. 
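+// In other words: a folded memory-indirect tail call can need up to two GPRs
+// (base + index) for its address, and after the epilogue only volatile
+// registers not holding outgoing arguments are usable. With all six argument
+// registers live, %r11 is the only remaining scratch register, so the load is
+// folded only when the X86tcret_6regs check above (at most six register
+// operands) succeeds.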
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), + (TCRETURNmi64 addr:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), + (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, + Requires<[IsLP64]>; + +def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), + (TCRETURNdi64 texternalsym:$dst, imm:$off)>, + Requires<[IsLP64]>; + +// Normal calls, with various flavors of addresses. +def : Pat<(X86call (i32 tglobaladdr:$dst)), + (CALLpcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i32 texternalsym:$dst)), + (CALLpcrel32 texternalsym:$dst)>; +def : Pat<(X86call (i32 imm:$dst)), + (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>; + +// Comparisons. + +// TEST R,R is smaller than CMP R,0 +def : Pat<(X86cmp GR8:$src1, 0), + (TEST8rr GR8:$src1, GR8:$src1)>; +def : Pat<(X86cmp GR16:$src1, 0), + (TEST16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86cmp GR32:$src1, 0), + (TEST32rr GR32:$src1, GR32:$src1)>; +def : Pat<(X86cmp GR64:$src1, 0), + (TEST64rr GR64:$src1, GR64:$src1)>; + +// Conditional moves with folded loads with operands swapped and conditions +// inverted. +multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32, + Instruction Inst64> { + let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), + (Inst16 GR16:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), + (Inst32 GR32:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), + (Inst64 GR64:$src2, addr:$src1)>; + } +} + +defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>; +defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>; +defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>; +defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>; +defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>; +defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>; +defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>; +defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>; +defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>; +defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>; +defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>; +defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>; +defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>; +defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>; +defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>; +defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; + +// zextload bool -> zextload byte +def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>; +def : Pat<(zextloadi16i1 addr:$src), (AND16ri8 (MOVZX16rm8 addr:$src), (i16 1))>; +def : Pat<(zextloadi32i1 addr:$src), (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1))>; +def : Pat<(zextloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), + (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; + +// extload bool -> extload byte +// When extloading from 16-bit and smaller memory locations into 64-bit +// registers, use zero-extending loads so that the entire 64-bit register is +// defined, avoiding partial-register updates. 
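+// Example: an i8 extload feeding an i64 use is selected as
+//   movzbl (mem), %eax   # clears bits 63:8 of %rax, no false dependence
+// wrapped in SUBREG_TO_REG, rather than "movb (mem), %al", which would merge
+// into the old value of %rax and cause a partial-register update.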
+ +def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; + +// For other extloads, use subregs, since the high contents of the register are +// defined after an extload. +def : Pat<(extloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i32 addr:$src), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; + +// anyext. Define these to do an explicit zero-extend to +// avoid partial-register updates. +def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG + (MOVZX32rr8 GR8 :$src), sub_16bit)>; +def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; + +// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32. +def : Pat<(i32 (anyext GR16:$src)), + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>; + +def : Pat<(i64 (anyext GR8 :$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>; +def : Pat<(i64 (anyext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>; +def : Pat<(i64 (anyext GR32:$src)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; + + +// Any instruction that defines a 32-bit result leaves the high half of the +// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may +// be copying from a truncate. And x86's cmov doesn't do anything if the +// condition is false. But any other 32-bit operation will zero-extend +// up to 64 bits. +def def32 : PatLeaf<(i32 GR32:$src), [{ + return N->getOpcode() != ISD::TRUNCATE && + N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && + N->getOpcode() != ISD::CopyFromReg && + N->getOpcode() != ISD::AssertSext && + N->getOpcode() != X86ISD::CMOV; +}]>; + +// In the case of a 32-bit def that is known to implicitly zero-extend, +// we can use a SUBREG_TO_REG. +def : Pat<(i64 (zext def32:$src)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; + +//===----------------------------------------------------------------------===// +// Pattern match OR as ADD +//===----------------------------------------------------------------------===// + +// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be +// 3-addressified into an LEA instruction to avoid copies. However, we also +// want to finally emit these instructions as an or at the end of the code +// generator to make the generated code easier to read. To do this, we select +// into "disjoint bits" pseudo ops. + +// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero. 
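+// Example of the payoff: in "(or (shl x, 3), 3)" the low three bits of the
+// shifted value are known zero, so the or behaves exactly like an add and the
+// whole expression can become a single three-address LEA, roughly
+//   leal 3(,%reg,8), %dst
+// which a real, two-address OR could never be turned into.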
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1))) + return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); + + APInt KnownZero0, KnownOne0; + CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0); + APInt KnownZero1, KnownOne1; + CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0); + return (~KnownZero0 & ~KnownZero1) == 0; +}]>; + + +// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. +// Try this before the selecting to OR. +let AddedComplexity = 5, SchedRW = [WriteALU] in { + +let isConvertibleToThreeAddress = 1, + Constraints = "$src1 = $dst", Defs = [EFLAGS] in { +let isCommutable = 1 in { +def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "", // orw/addw REG, REG + [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>; +def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "", // orl/addl REG, REG + [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>; +def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "", // orq/addq REG, REG + [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>; +} // isCommutable + +// NOTE: These are order specific, we want the ri8 forms to be listed +// first so that they are slightly preferred to the ri forms. + +def ADD16ri8_DB : I<0, Pseudo, + (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "", // orw/addw REG, imm8 + [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>; +def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "", // orw/addw REG, imm + [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>; + +def ADD32ri8_DB : I<0, Pseudo, + (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "", // orl/addl REG, imm8 + [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>; +def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), + "", // orl/addl REG, imm + [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>; + + +def ADD64ri8_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "", // orq/addq REG, imm8 + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt8:$src2))]>; +def ADD64ri32_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "", // orq/addq REG, imm + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt32:$src2))]>; +} +} // AddedComplexity, SchedRW + + +//===----------------------------------------------------------------------===// +// Some peepholes +//===----------------------------------------------------------------------===// + +// Odd encoding trick: -128 fits into an 8-bit immediate field while +// +128 doesn't, so in this special case use a sub instead of an add. +def : Pat<(add GR16:$src1, 128), + (SUB16ri8 GR16:$src1, -128)>; +def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst), + (SUB16mi8 addr:$dst, -128)>; + +def : Pat<(add GR32:$src1, 128), + (SUB32ri8 GR32:$src1, -128)>; +def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst), + (SUB32mi8 addr:$dst, -128)>; + +def : Pat<(add GR64:$src1, 128), + (SUB64ri8 GR64:$src1, -128)>; +def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst), + (SUB64mi8 addr:$dst, -128)>; + +// The same trick applies for 32-bit immediate fields in 64-bit +// instructions. 
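+// Concretely: +2147483648 does not fit in a sign-extended 32-bit immediate but
+// -2147483648 does, so instead of materializing the constant in a register for
+// an add, the pattern below emits the single instruction
+//   subq $-2147483648, %rax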
+def : Pat<(add GR64:$src1, 0x0000000080000000), + (SUB64ri32 GR64:$src1, 0xffffffff80000000)>; +def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst), + (SUB64mi32 addr:$dst, 0xffffffff80000000)>; + +// To avoid needing to materialize an immediate in a register, use a 32-bit and +// with implicit zero-extension instead of a 64-bit and if the immediate has at +// least 32 bits of leading zeros. If in addition the last 32 bits can be +// represented with a sign extension of a 8 bit constant, use that. +// This can also reduce instruction size by eliminating the need for the REX +// prefix. + +// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32. +let AddedComplexity = 1 in { +def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri8 + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo8XForm imm:$imm))), + sub_32bit)>; + +def : Pat<(and GR64:$src, i64immZExt32:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo32XForm imm:$imm))), + sub_32bit)>; +} // AddedComplexity = 1 + + +// AddedComplexity is needed due to the increased complexity on the +// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all +// the MOVZX patterns keeps thems together in DAGIsel tables. +let AddedComplexity = 1 in { +// r & (2^16-1) ==> movz +def : Pat<(and GR32:$src1, 0xffff), + (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1, + GR32_ABCD)), + sub_8bit))>, + Requires<[Not64BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG + (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), + sub_16bit)>, + Requires<[Not64BitMode]>; + +// r & (2^32-1) ==> movz +def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), + (SUBREG_TO_REG (i64 0), + (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), + sub_32bit)>; +// r & (2^16-1) ==> movz +def : Pat<(and GR64:$src, 0xffff), + (SUBREG_TO_REG (i64 0), + (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), + sub_32bit)>; +// r & (2^8-1) ==> movz +def : Pat<(and GR64:$src, 0xff), + (SUBREG_TO_REG (i64 0), + (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))), + sub_32bit)>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, + Requires<[In64BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (i8 + (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, + Requires<[In64BitMode]>; +} // AddedComplexity = 1 + + +// sext_inreg patterns +def : Pat<(sext_inreg GR32:$src, i16), + (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit))>, + Requires<[Not64BitMode]>; + +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG + (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), + sub_16bit)>, + Requires<[Not64BitMode]>; + +def : Pat<(sext_inreg GR64:$src, i32), + (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; +def : Pat<(sext_inreg GR64:$src, i16), + (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>; +def : Pat<(sext_inreg GR64:$src, i8), + (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG 
GR32:$src, sub_8bit))>, + Requires<[In64BitMode]>; +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (MOVSX32rr8 + (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, + Requires<[In64BitMode]>; + +// sext, sext_load, zext, zext_load +def: Pat<(i16 (sext GR8:$src)), + (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(sextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; +def: Pat<(i16 (zext GR8:$src)), + (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(zextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; + +// trunc patterns +def : Pat<(i16 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_32bit)>; +def : Pat<(i16 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_8bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_8bit)>, + Requires<[In64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG GR16:$src, sub_8bit)>, + Requires<[In64BitMode]>; + +// h-register tricks +def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)>, + Requires<[Not64BitMode]>; +def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi)>, + Requires<[Not64BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32rr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; + +// h-register tricks. +// For now, be conservative on x86-64 and use an h-register extract only if the +// value is immediately zero-extended or stored, which are somewhat common +// cases. This uses a bunch of code to prevent a register requiring a REX prefix +// from being allocated in the same instruction as the h register, as there's +// currently no way to describe this requirement to the register allocator. + +// h-register extract and zero-extend. 
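+// Background for the _NOREX forms below: AH/BH/CH/DH cannot be encoded in any
+// instruction that carries a REX prefix (those encodings mean SPL/BPL/SIL/DIL
+// there), so e.g. "(x >> 8) & 0xff" becomes
+//   movzbl %ah, %ecx
+// only when no operand forces a REX prefix; MOVZX32_NOREXrr8 / MOV8mr_NOREX
+// together with the *_ABCD register classes enforce that constraint.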
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[In64BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; + +// h-register extract and store. +def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi))>; +def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; + + +// (shl x, 1) ==> (add x, x) +// Note that if x is undef (immediate or otherwise), we could theoretically +// end up with the two uses of x getting different values, producing a result +// where the least significant bit is not 0. However, the probability of this +// happening is considered low enough that this is officially not a +// "real problem". +def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>; +def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; +def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; + +// Helper imms that check if a mask doesn't change significant shift bits. +def immShift32 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 5; +}]>; +def immShift64 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 6; +}]>; + +// Shift amount is implicitly masked. 
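+// The hardware masks the CL shift count to 5 bits for 8/16/32-bit operands and
+// to 6 bits for 64-bit operands, so an explicit mask in the input is
+// redundant, e.g.
+//   andb $31, %cl
+//   shll %cl, %eax
+// is selected as just "shll %cl, %eax". The immShift32/immShift64 leaves above
+// accept any mask that preserves at least those low bits.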
+multiclass MaskedShiftAmountPats<SDNode frag, string name> { + // (shift x (and y, 31)) ==> (shift x, y) + def : Pat<(frag GR8:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "8rCL") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "16rCL") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "32rCL") GR32:$src1)>; + def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "8mCL") addr:$dst)>; + def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "16mCL") addr:$dst)>; + def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "32mCL") addr:$dst)>; + + // (shift x (and y, 63)) ==> (shift x, y) + def : Pat<(frag GR64:$src1, (and CL, immShift64)), + (!cast<Instruction>(name # "64rCL") GR64:$src1)>; + def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + (!cast<Instruction>(name # "64mCL") addr:$dst)>; +} + +defm : MaskedShiftAmountPats<shl, "SHL">; +defm : MaskedShiftAmountPats<srl, "SHR">; +defm : MaskedShiftAmountPats<sra, "SAR">; +defm : MaskedShiftAmountPats<rotl, "ROL">; +defm : MaskedShiftAmountPats<rotr, "ROR">; + +// (anyext (setcc_carry)) -> (setcc_carry) +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; + + + + +//===----------------------------------------------------------------------===// +// EFLAGS-defining Patterns +//===----------------------------------------------------------------------===// + +// add reg, reg +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; + +// add reg, mem +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)), + (ADD8rm GR8:$src1, addr:$src2)>; +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)), + (ADD16rm GR16:$src1, addr:$src2)>; +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)), + (ADD32rm GR32:$src1, addr:$src2)>; + +// add reg, imm +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(add GR16:$src1, i16immSExt8:$src2), + (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(add GR32:$src1, i32immSExt8:$src2), + (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// sub reg, reg +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; + +// sub reg, mem +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), + (SUB8rm GR8:$src1, addr:$src2)>; +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), + (SUB16rm GR16:$src1, addr:$src2)>; +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), + (SUB32rm GR32:$src1, addr:$src2)>; + +// sub reg, imm +def : Pat<(sub GR8:$src1, imm:$src2), + (SUB8ri GR8:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, imm:$src2), + (SUB16ri GR16:$src1, imm:$src2)>; +def : Pat<(sub GR32:$src1, imm:$src2), + (SUB32ri GR32:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, 
i16immSExt8:$src2), + (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(sub GR32:$src1, i32immSExt8:$src2), + (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// sub 0, reg +def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>; +def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; +def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; +def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; + +// mul reg, reg +def : Pat<(mul GR16:$src1, GR16:$src2), + (IMUL16rr GR16:$src1, GR16:$src2)>; +def : Pat<(mul GR32:$src1, GR32:$src2), + (IMUL32rr GR32:$src1, GR32:$src2)>; + +// mul reg, mem +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), + (IMUL16rm GR16:$src1, addr:$src2)>; +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), + (IMUL32rm GR32:$src1, addr:$src2)>; + +// mul reg, imm +def : Pat<(mul GR16:$src1, imm:$src2), + (IMUL16rri GR16:$src1, imm:$src2)>; +def : Pat<(mul GR32:$src1, imm:$src2), + (IMUL32rri GR32:$src1, imm:$src2)>; +def : Pat<(mul GR16:$src1, i16immSExt8:$src2), + (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(mul GR32:$src1, i32immSExt8:$src2), + (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; + +// reg = mul mem, imm +def : Pat<(mul (loadi16 addr:$src1), imm:$src2), + (IMUL16rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), imm:$src2), + (IMUL32rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), + (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), + (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; + +// Patterns for nodes that do not produce flags, for instructions that do. + +// addition +def : Pat<(add GR64:$src1, GR64:$src2), + (ADD64rr GR64:$src1, GR64:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt8:$src2), + (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt32:$src2), + (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), + (ADD64rm GR64:$src1, addr:$src2)>; + +// subtraction +def : Pat<(sub GR64:$src1, GR64:$src2), + (SUB64rr GR64:$src1, GR64:$src2)>; +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), + (SUB64rm GR64:$src1, addr:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt8:$src2), + (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt32:$src2), + (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Multiply +def : Pat<(mul GR64:$src1, GR64:$src2), + (IMUL64rr GR64:$src1, GR64:$src2)>; +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), + (IMUL64rm GR64:$src1, addr:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt8:$src2), + (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt32:$src2), + (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), + (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), + (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; + +// Increment/Decrement reg. +// Do not make INC/DEC if it is slow +let Predicates = [NotSlowIncDec] in { + def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>; + def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>; + def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>; + def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; + def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>; + def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>; + def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>; + def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; +} + +// or reg/reg. 
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>; + +// or reg/mem +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), + (OR8rm GR8:$src1, addr:$src2)>; +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), + (OR16rm GR16:$src1, addr:$src2)>; +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), + (OR32rm GR32:$src1, addr:$src2)>; +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), + (OR64rm GR64:$src1, addr:$src2)>; + +// or reg/imm +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>; +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, i16immSExt8:$src2), + (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(or GR32:$src1, i32immSExt8:$src2), + (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt8:$src2), + (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt32:$src2), + (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// xor reg/reg +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; + +// xor reg/mem +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), + (XOR8rm GR8:$src1, addr:$src2)>; +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), + (XOR16rm GR16:$src1, addr:$src2)>; +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), + (XOR32rm GR32:$src1, addr:$src2)>; +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), + (XOR64rm GR64:$src1, addr:$src2)>; + +// xor reg/imm +def : Pat<(xor GR8:$src1, imm:$src2), + (XOR8ri GR8:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, imm:$src2), + (XOR16ri GR16:$src1, imm:$src2)>; +def : Pat<(xor GR32:$src1, imm:$src2), + (XOR32ri GR32:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, i16immSExt8:$src2), + (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(xor GR32:$src1, i32immSExt8:$src2), + (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt8:$src2), + (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt32:$src2), + (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// and reg/reg +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; + +// and reg/mem +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), + (AND8rm GR8:$src1, addr:$src2)>; +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), + (AND16rm GR16:$src1, addr:$src2)>; +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), + (AND32rm GR32:$src1, addr:$src2)>; +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), + (AND64rm GR64:$src1, addr:$src2)>; + +// and reg/imm +def : Pat<(and GR8:$src1, imm:$src2), + (AND8ri GR8:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, imm:$src2), + (AND16ri GR16:$src1, imm:$src2)>; +def : Pat<(and GR32:$src1, imm:$src2), + (AND32ri GR32:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, i16immSExt8:$src2), + (AND16ri8 
GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(and GR32:$src1, i32immSExt8:$src2), + (AND32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt8:$src2), + (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt32:$src2), + (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Bit scan instruction patterns to match explicit zero-undef behavior. +def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>; +def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>; +def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>; +def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>; + +// When HasMOVBE is enabled it is possible to get a non-legalized +// register-register 16 bit bswap. This maps it to a ROL instruction. +let Predicates = [HasMOVBE] in { + def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td new file mode 100644 index 0000000..8c351a5 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -0,0 +1,329 @@ +//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 jump, return, call, and related instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +// Return instructions. +// +// The X86retflag return instructions are variadic because we may add ST0 and +// ST1 arguments when returning values on the x87 stack. 
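A small C++ illustration of the comment above (the function name is invented for the example): on 32-bit x86 a floating-point result is returned on the x87 stack, so the value reaches the return as an extra ST0 operand (and ST1 for complex types) on the variadic RET node rather than through a fixed register class.

    long double one() { return 1.0L; }   // result is expected to be returned in ST(0)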
+let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in { + def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops), + "ret{l}", [(X86retflag 0)], IIC_RET>, OpSize32, + Requires<[Not64BitMode]>; + def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops), + "ret{q}", [(X86retflag 0)], IIC_RET>, OpSize32, + Requires<[In64BitMode]>; + def RETW : I <0xC3, RawFrm, (outs), (ins), + "ret{w}", + [], IIC_RET>, OpSize16; + def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + "ret{l}\t$amt", + [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32, + Requires<[Not64BitMode]>; + def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + "ret{q}\t$amt", + [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32, + Requires<[In64BitMode]>; + def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), + "ret{w}\t$amt", + [], IIC_RET_IMM>, OpSize16; + def LRETL : I <0xCB, RawFrm, (outs), (ins), + "{l}ret{l|f}", [], IIC_RET>, OpSize32; + def LRETQ : RI <0xCB, RawFrm, (outs), (ins), + "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>; + def LRETW : I <0xCB, RawFrm, (outs), (ins), + "{l}ret{w|f}", [], IIC_RET>, OpSize16; + def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32; + def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>; + def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16; + + // The machine return from interrupt instruction, but sometimes we need to + // perform a post-epilogue stack adjustment. Codegen emits the pseudo form + // which expands to include an SP adjustment if necessary. + def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, + OpSize16; + def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], + IIC_IRET>, OpSize32; + def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [], + IIC_IRET>, Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in + def IRET : PseudoI<(outs), (ins i16imm:$adj), [(X86iret timm:$adj)]>; + +} + +// Unconditional branches. +let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { + def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst), + "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst), + "jmp\t$dst", [], IIC_JMP_REL>, OpSize16; + def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst), + "jmp\t$dst", [], IIC_JMP_REL>, OpSize32; + } +} + +// Conditional Branches. 
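Before the ICBr instantiations that follow, a standalone sketch of the opcode layout they encode; the helper names are hypothetical, and the byte values come straight from the defm list below (0x70..0x7F for the rel8 forms, 0x0F 0x80..0x8F for the rel16/rel32 forms, the 0x0F escape being the TB modifier on the _2/_4 variants).

    // Illustrative only, not LLVM code. CC is the x86 condition-code nibble
    // (0 = O, 2 = B, 4 = E, ..., 0xF = G).
    unsigned shortJccOpcode(unsigned CC) { return 0x70 + CC; } // e.g. JE (CC = 4) -> 0x74
    unsigned nearJccOpcode(unsigned CC)  { return 0x80 + CC; } // emitted after the 0x0F escape byte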
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in { + multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { + def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, + [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm, + [], IIC_Jcc>, OpSize16, TB; + def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm, + [], IIC_Jcc>, TB, OpSize32; + } + } +} + +defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>; +defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>; +defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>; +defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>; +defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>; +defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>; +defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>; +defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>; +defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>; +defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>; +defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>; +defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>; +defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>; +defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>; +defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>; +defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>; + +// jcx/jecx/jrcx instructions. +let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in { + // These are the 32-bit versions of this instruction for the asmparser. In + // 32-bit mode, the address size prefix is jcxz and the unprefixed version is + // jecxz. + let Uses = [CX] in + def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jcxz\t$dst", [], IIC_JCXZ>, AdSize16, + Requires<[Not64BitMode]>; + let Uses = [ECX] in + def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jecxz\t$dst", [], IIC_JCXZ>, AdSize32; + + let Uses = [RCX] in + def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64, + Requires<[In64BitMode]>; +} + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst", + [(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>, + OpSize16, Sched<[WriteJump]>; + def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst", + [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>, + Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>; + + def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst", + [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>, + OpSize32, Sched<[WriteJump]>; + def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst", + [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, + Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>; + + def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst", + [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>, + Sched<[WriteJump]>; + def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst", + [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, + Requires<[In64BitMode]>, Sched<[WriteJumpLd]>; + + let Predicates = [Not64BitMode] in { + def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "ljmp{w}\t$seg, $off", [], + IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>; + def FARJMP32i : Iseg32<0xEA, 
RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "ljmp{l}\t$seg, $off", [], + IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + } + def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst), + "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>, + Sched<[WriteJump]>; + + def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst), + "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16, + Sched<[WriteJumpLd]>; + def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst), + "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32, + Sched<[WriteJumpLd]>; +} + + +// Loop instructions +let SchedRW = [WriteJump] in { +def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>; +def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>; +def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>; +} + +//===----------------------------------------------------------------------===// +// Call Instructions... +// +let isCall = 1 in + // All calls clobber the non-callee saved registers. ESP is marked as + // a use to prevent stack-pointer assignments that appear immediately + // before calls from potentially appearing dead. Uses for argument + // registers are added manually. + let Uses = [ESP] in { + def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i32imm_pcrel:$dst), + "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32, + Requires<[Not64BitMode]>, Sched<[WriteJump]>; + let hasSideEffects = 0 in + def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, + (outs), (ins i16imm_pcrel:$dst), + "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16, + Sched<[WriteJump]>; + def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst), + "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>, + OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>; + def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst), + "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))], + IIC_CALL_MEM>, OpSize16, + Requires<[Not64BitMode,FavorMemIndirectCall]>, + Sched<[WriteJumpLd]>; + def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), + "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, + OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>; + def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), + "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], + IIC_CALL_MEM>, OpSize32, + Requires<[Not64BitMode,FavorMemIndirectCall]>, + Sched<[WriteJumpLd]>; + + let Predicates = [Not64BitMode] in { + def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "lcall{w}\t$seg, $off", [], + IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>; + def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "lcall{l}\t$seg, $off", [], + IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + } + + def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst), + "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16, + Sched<[WriteJumpLd]>; + def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst), + "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32, + Sched<[WriteJumpLd]>; + } + + +// Tail call stuff. 
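A hedged sketch of what the tail-call pseudos defined next are for (function names are invented): a call in tail position is selected as a TCRETURN pseudo and, once the caller's frame has been torn down, emitted as the corresponding TAILJMP, that is, a plain jump to the callee that reuses the caller's return address instead of pushing a new one.

    int callee(int);
    int caller(int x) { return callee(x + 1); }  // with optimization, expected to end in "jmp callee" rather than call+ret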
+ +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + let Uses = [ESP] in { + def TCRETURNdi : PseudoI<(outs), + (ins i32imm_pcrel:$dst, i32imm:$offset), []>; + def TCRETURNri : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; + let mayLoad = 1 in + def TCRETURNmi : PseudoI<(outs), + (ins i32mem_TC:$dst, i32imm:$offset), []>; + + // FIXME: The should be pseudo instructions that are lowered when going to + // mcinst. + def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i32imm_pcrel:$dst), + "jmp\t$dst", + [], IIC_JMP_REL>; + def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. + let mayLoad = 1 in + def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), + "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>; +} + + +//===----------------------------------------------------------------------===// +// Call Instructions... +// + +// RSP is marked as a use to prevent stack-pointer assignments that appear +// immediately before calls from potentially appearing dead. Uses for argument +// registers are added manually. +let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in { + // NOTE: this pattern doesn't match "X86call imm", because we do not know + // that the offset between an arbitrary immediate and the call will fit in + // the 32-bit pcrel field that we have. + def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst), + "call{q}\t$dst", [], IIC_CALL_RI>, OpSize32, + Requires<[In64BitMode]>; + def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), + "call{q}\t{*}$dst", [(X86call GR64:$dst)], + IIC_CALL_RI>, + Requires<[In64BitMode]>; + def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), + "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], + IIC_CALL_MEM>, + Requires<[In64BitMode,FavorMemIndirectCall]>; + + def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), + "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; +} + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1, + SchedRW = [WriteJump] in { + def TCRETURNdi64 : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$offset), + []>; + def TCRETURNri64 : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; + let mayLoad = 1 in + def TCRETURNmi64 : PseudoI<(outs), + (ins i64mem_TC:$dst, i32imm:$offset), []>; + + def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), + "jmp\t$dst", [], IIC_JMP_REL>; + def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + let mayLoad = 1 in + def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), + "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + // Win64 wants jumps leaving the function to have a REX_W prefix. 
+ let hasREX_WPrefix = 1 in { + def TAILJMPd64_REX : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i64i32imm_pcrel:$dst), + "rex64 jmp\t$dst", [], IIC_JMP_REL>; + def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + let mayLoad = 1 in + def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), + "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + } +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td new file mode 100644 index 0000000..c4b2d6d --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -0,0 +1,182 @@ +//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the sign and zero extension operations. +// +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0 in { + let Defs = [AX], Uses = [AL] in + def CBW : I<0x98, RawFrm, (outs), (ins), + "{cbtw|cbw}", [], IIC_CBW>, OpSize16; // AX = signext(AL) + let Defs = [EAX], Uses = [AX] in + def CWDE : I<0x98, RawFrm, (outs), (ins), + "{cwtl|cwde}", [], IIC_CBW>, OpSize32; // EAX = signext(AX) + + let Defs = [AX,DX], Uses = [AX] in + def CWD : I<0x99, RawFrm, (outs), (ins), + "{cwtd|cwd}", [], IIC_CBW>, OpSize16; // DX:AX = signext(AX) + let Defs = [EAX,EDX], Uses = [EAX] in + def CDQ : I<0x99, RawFrm, (outs), (ins), + "{cltd|cdq}", [], IIC_CBW>, OpSize32; // EDX:EAX = signext(EAX) + + + let Defs = [RAX], Uses = [EAX] in + def CDQE : RI<0x98, RawFrm, (outs), (ins), + "{cltq|cdqe}", [], IIC_CBW>; // RAX = signext(EAX) + + let Defs = [RAX,RDX], Uses = [RAX] in + def CQO : RI<0x99, RawFrm, (outs), (ins), + "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX) +} + + + +// Sign/Zero extenders +let hasSideEffects = 0 in { +def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>, + TB, OpSize16, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>, + TB, OpSize16, Sched<[WriteALULd]>; +} // hasSideEffects = 0 +def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB, + OpSize32, Sched<[WriteALULd]>; +def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>, + OpSize32, TB, Sched<[WriteALULd]>; + +let hasSideEffects = 0 in { +def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>, + TB, OpSize16, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX16rm8 : I<0xB6, 
MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>, + TB, OpSize16, Sched<[WriteALULd]>; +} // hasSideEffects = 0 +def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB, + OpSize32, Sched<[WriteALULd]>; +def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>, + TB, OpSize32, Sched<[WriteALULd]>; + +// These are the same as the regular MOVZX32rr8 and MOVZX32rm8 +// except that they use GR32_NOREX for the output operand register class +// instead of GR32. This allows them to operate on h registers on x86-64. +let hasSideEffects = 0, isCodeGenOnly = 1 in { +def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVZX>, TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; + +def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALULd]>; +} + +// MOVSX64rr8 always has a REX prefix and it has an 8-bit register +// operand, which makes it a rare instruction with an 8-bit register +// operand that can never access an h register. If support for h registers +// were generalized, this would require a special register class. 
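A short illustration of how these extension instructions are reached from source level (function names invented; the noted selections are expectations, not guarantees):

    int      widenS(short x)         { return x; }  // sign extension, e.g. MOVSX32rr16 ("movswl")
    unsigned widenZ(unsigned char x) { return x; }  // zero extension, e.g. MOVZX32rr8 ("movzbl")
    unsigned long long widen64(unsigned x) { return x; }  // handled below by a 32-bit MOV wrapped in SUBREG_TO_REG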
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), + "movs{bq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; +def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), + "movs{bq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>, + TB, Sched<[WriteALULd]>; +def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movs{wq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; +def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movs{wq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>, + TB, Sched<[WriteALULd]>; +def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "movs{lq|xd}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>, + Sched<[WriteALU]>, Requires<[In64BitMode]>; +def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), + "movs{lq|xd}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>, + Sched<[WriteALULd]>, Requires<[In64BitMode]>; + +// movzbq and movzwq encodings for the disassembler +def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; + +// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a +// 32-bit register. +def : Pat<(i64 (zext GR8:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>; +def : Pat<(zextloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; + +def : Pat<(i64 (zext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>; +def : Pat<(zextloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; + +// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a +// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible +// when the 32-bit value is defined by a truncate or is copied from something +// where the high bits aren't necessarily all zero. In such cases, we fall back +// to these explicit zext instructions. +def : Pat<(i64 (zext GR32:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>; +def : Pat<(i64 (zextloadi64i32 addr:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td new file mode 100644 index 0000000..fd800cf --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -0,0 +1,441 @@ +//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes FMA (Fused Multiply-Add) instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FMA3 - Intel 3 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + +// For all FMA opcodes declared in fma3p_rm and fma3s_rm milticlasses defined +// below, both the register and memory variants are commutable. +// For the register form the commutable operands are 1, 2 and 3. +// For the memory variant the folded operand must be in 3. Thus, +// in that case, only the operands 1 and 2 can be swapped. +// Commuting some of operands may require the opcode change. +// FMA*213*: +// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); +// operands 1 and 3 (register forms only): *213* --> *231*; +// operands 2 and 3 (register forms only): *213* --> *132*. +// FMA*132*: +// operands 1 and 2 (memory & register forms): *132* --> *231*; +// operands 1 and 3 (register forms only): *132* --> *132*(no changes); +// operands 2 and 3 (register forms only): *132* --> *213*. +// FMA*231*: +// operands 1 and 2 (memory & register forms): *231* --> *132*; +// operands 1 and 3 (register forms only): *231* --> *213*; +// operands 2 and 3 (register forms only): *231* --> *231*(no changes). + +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in +multiclass fma3p_rm<bits<8> opc, string OpcodeStr, + PatFrag MemFrag128, PatFrag MemFrag256, + ValueType OpVT128, ValueType OpVT256, + SDPatternOperator Op = null_frag> { + let usesCustomInserter = 1 in + def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (OpVT128 (Op VR128:$src2, + VR128:$src1, VR128:$src3)))]>; + + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, + (MemFrag128 addr:$src3))))]>; + + let usesCustomInserter = 1 in + def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, + VR256:$src3)))]>, VEX_L; + + let mayLoad = 1 in + def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, + (OpVT256 (Op VR256:$src2, VR256:$src1, + (MemFrag256 addr:$src3))))]>, VEX_L; +} + +multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpcodeStr, string PackTy, + PatFrag MemFrag128, PatFrag MemFrag256, + SDNode Op, ValueType OpTy128, ValueType OpTy256> { + defm r213 : fma3p_rm<opc213, + !strconcat(OpcodeStr, "213", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; + defm r132 : fma3p_rm<opc132, + !strconcat(OpcodeStr, "132", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256>; + defm r231 : fma3p_rm<opc231, + !strconcat(OpcodeStr, "231", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256>; +} + +// Fused Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFMADDPS : 
fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32, + loadv8f32, X86Fmadd, v4f32, v8f32>; + defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32, + loadv8f32, X86Fmsub, v4f32, v8f32>; + defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", + loadv4f32, loadv8f32, X86Fmaddsub, + v4f32, v8f32>; + defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", + loadv4f32, loadv8f32, X86Fmsubadd, + v4f32, v8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64, + loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W; + defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64, + loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W; + defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", + loadv2f64, loadv4f64, X86Fmaddsub, + v2f64, v4f64>, VEX_W; + defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", + loadv2f64, loadv4f64, X86Fmsubadd, + v2f64, v4f64>, VEX_W; +} + +// Fused Negative Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", loadv4f32, + loadv8f32, X86Fnmadd, v4f32, v8f32>; + defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", loadv4f32, + loadv8f32, X86Fnmsub, v4f32, v8f32>; +} +let ExeDomain = SSEPackedDouble in { + defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64, + loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W; + defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", + loadv2f64, loadv4f64, X86Fnmsub, v2f64, + v4f64>, VEX_W; +} + +// All source register operands of FMA opcodes defined in fma3s_rm multiclass +// can be commuted. In many cases such commute transformation requres an opcode +// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form +// would require an opcode change to FMA*231: +// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2; +// --> +// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2; +// Please see more detailed comment at the very beginning of the section +// defining FMA3 opcodes above. +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode = null_frag> { + let usesCustomInserter = 1 in + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>; + + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>; +} + +// These FMA*_Int instructions are defined specially for being used when +// the scalar FMA intrinsics are lowered to machine instructions, and in that +// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc. +// instructions. +// +// All of the FMA*_Int opcodes are defined as commutable here. +// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial +// and the corresponding optimizations have been developed. +// Commuting the 1st operand of FMA*_Int requires some additional analysis, +// the commute optimization is legal only if all users of FMA*_Int use only +// the lowest element of the FMA*_Int instruction. 
Even though such analysis
+// may not be implemented yet, we allow the routines doing the actual commute
+// transformation to decide if one or another instruction is commutable or not.
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0 in
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+ Operand memopr, RegisterClass RC> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+
+ let mayLoad = 1 in
+ def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memopr:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+}
+
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy,
+ SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop> {
+ defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>;
+ defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC,
+ OpNode>;
+ defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>;
+}
+
+// The FMA 213 form is created for lowering of scalar FMA intrinsics
+// to machine instructions.
+// The FMA 132 form can be obtained trivially by commuting the 2nd and 3rd
+// operands of the FMA 213 form.
+// The FMA 231 form can be obtained only by commuting the 1st operand of 213 or 132
+// forms and is possible only after special analysis of all uses of the initial
+// instruction. Such analysis does not exist yet, and thus introducing the 231
+// form of FMA*_Int instructions is done using an optimistic assumption that
+// such analysis will be implemented eventually.
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy,
+ RegisterClass RC, Operand memop> {
+ defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+ memop, RC>;
+ defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+ memop, RC>;
+ defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+ memop, RC>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+ SDNode OpNode> {
+ let ExeDomain = SSEPackedSingle in
+ defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode,
+ FR32, f32mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>;
+
+ let ExeDomain = SSEPackedDouble in
+ defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode,
+ FR64, f64mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>,
+ VEX_W;
+
+ // These patterns use the 123 ordering, instead of 213, even though
+ // they match the intrinsic to the 213 version of the instruction.
+ // This is because src1 is tied to dest, and the scalar intrinsics
+ // require the pass-through values to come from the first source
+ // operand, not the second.
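The operand-swap rules spelled out at the top of this FMA3 section reduce to a small mapping; the following self-contained C++ sketch merely restates that table (the enum and helper names are hypothetical and are not LLVM's API).

    enum FMA3Form { Form132, Form213, Form231 };
    enum SwapPair { Swap12, Swap13, Swap23 };  // which pair of source operands is exchanged

    // Returns the form whose operand order matches the original computation after
    // the given swap; Swap13 and Swap23 apply to the register forms only.
    FMA3Form commuteFMA3Form(FMA3Form F, SwapPair P) {
      switch (P) {
      case Swap12: return F == Form213 ? Form213 : (F == Form132 ? Form231 : Form132);
      case Swap13: return F == Form213 ? Form231 : (F == Form132 ? Form132 : Form213);
      case Swap23: return F == Form213 ? Form132 : (F == Form132 ? Form213 : Form231);
      }
      return F;
    }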
+ def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int") + $src1, $src2, $src3), VR128)>; + + def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int") + $src1, $src2, $src3), VR128)>; +} + +defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, + int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG; +defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss, + int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG; + +defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss, + int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG; +defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, + int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG; + + +//===----------------------------------------------------------------------===// +// FMA4 - AMD 4 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + + +multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, + PatFrag mem_frag> { + let isCommutable = 1 in + def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG; +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + VEX_LIG; +} + +multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic Int> { +let isCodeGenOnly = 1 in { + let isCommutable = 1 in + def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4; + def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, + mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4; + def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, memop:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG; +} // isCodeGenOnly = 1 +} + +multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT128, ValueType OpVT256, + PatFrag ld_frag128, PatFrag ld_frag256> { + let isCommutable = 1 in + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins 
VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, + VEX_W, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2, + (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + let isCommutable = 1 in + def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, + VEX_W, MemOp4, VEX_L; + def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, + (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L; + def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src1, + (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L; +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { + def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; + def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + VEX_L; +} // isCodeGenOnly = 1 +} + +let ExeDomain = SSEPackedSingle in { + // Scalar Instructions + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; + // Packed Instructions + defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32, + loadv4f32, loadv8f32>; +} + +let ExeDomain = SSEPackedDouble in { + // 
Scalar Instructions + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + // Packed Instructions + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64, + loadv2f64, loadv4f64>; +} + diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td new file mode 100644 index 0000000..03ae211 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -0,0 +1,729 @@ +//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 x87 FPU instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FPStack specific DAG Nodes. 
+//===----------------------------------------------------------------------===// + +def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, + SDTCisVT<1, f80>]>; +def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; +def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; + +def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, + [SDNPHasChain, SDNPInGlue, SDNPMayStore, + SDNPMemOperand]>; +def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, + [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, + SDNPMemOperand]>; +def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; +def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, + [SDNPHasChain, SDNPMayStore, SDNPSideEffect, + SDNPMemOperand]>; + +//===----------------------------------------------------------------------===// +// FPStack pattern fragments +//===----------------------------------------------------------------------===// + +def fpimm0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +def fpimmneg0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(-0.0); +}]>; + +def fpimm1 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+1.0); +}]>; + +def fpimmneg1 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(-1.0); +}]>; + +// Some 'special' instructions +let usesCustomInserter = 1 in { // Expanded after instruction selection. + def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), + [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), + [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src), + [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>; + def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src), + [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src), + [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src), + [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>; + def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src), + [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src), + [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src), + [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>; +} + +// All FP Stack operations are represented with four instructions here. 
The +// first three instructions, generated by the instruction selector, use "RFP32" +// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit, +// 64-bit or 80-bit floating point values. These sizes apply to the values, +// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be +// copied to each other without losing information. These instructions are all +// pseudo instructions and use the "_Fp" suffix. +// In some cases there are additional variants with a mixture of different +// register sizes. +// The second instruction is defined with FPI, which is the actual instruction +// emitted by the assembler. These use "RST" registers, although frequently +// the actual register(s) used are implicit. These are always 80 bits. +// The FP stackifier pass converts one to the other after register allocation +// occurs. +// +// Note that the FpI instruction should have instruction selection info (e.g. +// a pattern) and the FPI instruction should have emission info (e.g. opcode +// encoding and asm printing info). + +// FpIf32, FpIf64 - Floating Point Pseudo Instruction template. +// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1. +// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2. +// f80 instructions cannot use SSE and use neither of these. +class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>; +class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>; + +// Factoring for arithmetic. +multiclass FPBinary_rr<SDNode OpNode> { +// Register op register -> register +// These are separated out because they have no reversed form. +def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP, + [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>; +def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP, + [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>; +def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, + [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>; +} +// The FopST0 series are not included here because of the irregularities +// in where the 'r' goes in assembly output. +// These instructions cannot address 80-bit memory. 
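A hedged source-level example of the "ST(0) = ST(0) op [mem]" shapes the FPBinary multiclass below generates (the function name is invented): when only the x87 unit is available, the load is expected to be folded into the arithmetic instruction through the _Fp64m pseudo and, after FP stackification, emitted as the real add with a 64-bit memory operand.

    double acc(double a, const double *p) { return a + *p; }  // e.g. "faddl (mem)" on an x87-only configuration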
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring, + bit Forward = 1> { +// ST(0) = ST(0) + [mem] +def _Fp32m : FpIf32<(outs RFP32:$dst), + (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; +def _Fp64m : FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; +def _Fp64m32: FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; +def _Fp80m32: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; +def _Fp80m64: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in +def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), + !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in +def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), + !strconcat("f", asmstring, "{l}\t$src")>; +// ST(0) = ST(0) + [memint] +def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; +def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; +def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; +def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; +def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; +def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +let mayLoad = 1 in +def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), + !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in +def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), + !strconcat("fi", asmstring, "{l}\t$src")>; +} + +let Defs = [FPSW] in { +// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling +// resources. 
+defm ADD : FPBinary_rr<fadd>; +defm SUB : FPBinary_rr<fsub>; +defm MUL : FPBinary_rr<fmul>; +defm DIV : FPBinary_rr<fdiv>; +// Sets the scheduling resources for the actual NAME#_F<size>m defintions. +let SchedRW = [WriteFAddLd] in { +defm ADD : FPBinary<fadd, MRM0m, "add">; +defm SUB : FPBinary<fsub, MRM4m, "sub">; +defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>; +} +let SchedRW = [WriteFMulLd] in { +defm MUL : FPBinary<fmul, MRM1m, "mul">; +} +let SchedRW = [WriteFDivLd] in { +defm DIV : FPBinary<fdiv, MRM6m, "div">; +defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>; +} +} + +class FPST0rInst<Format fp, string asm> + : FPI<0xD8, fp, (outs), (ins RST:$op), asm>; +class FPrST0Inst<Format fp, string asm> + : FPI<0xDC, fp, (outs), (ins RST:$op), asm>; +class FPrST0PInst<Format fp, string asm> + : FPI<0xDE, fp, (outs), (ins RST:$op), asm>; + +// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion +// of some of the 'reverse' forms of the fsub and fdiv instructions. As such, +// we have to put some 'r's in and take them out of weird places. +let SchedRW = [WriteFAdd] in { +def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t$op">; +def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">; +def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t$op">; +def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t$op">; +def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">; +def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">; +def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">; +def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">; +def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">; +} // SchedRW +let SchedRW = [WriteFMul] in { +def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">; +def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">; +def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t$op">; +} // SchedRW +let SchedRW = [WriteFDiv] in { +def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t$op">; +def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">; +def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">; +def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t$op">; +def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">; +def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">; +} // SchedRW + +def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">; +def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">; + +// Unary operations. +multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> { +def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW, + [(set RFP32:$dst, (OpNode RFP32:$src))]>; +def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW, + [(set RFP64:$dst, (OpNode RFP64:$src))]>; +def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, + [(set RFP80:$dst, (OpNode RFP80:$src))]>; +def _F : FPI<0xD9, fp, (outs), (ins), asmstring>; +} + +let Defs = [FPSW] in { +defm CHS : FPUnary<fneg, MRM_E0, "fchs">; +defm ABS : FPUnary<fabs, MRM_E1, "fabs">; +let SchedRW = [WriteFSqrt] in { +defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">; +} +defm SIN : FPUnary<fsin, MRM_FE, "fsin">; +defm COS : FPUnary<fcos, MRM_FF, "fcos">; + +let hasSideEffects = 0 in { +def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>; +def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; +def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; +} +def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">; +} // Defs = [FPSW] + +// Versions of FP instructions that take a single memory operand. 
Added for the +// disassembler; remove as they are included with patterns elsewhere. +def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; +def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; + +def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; +def FSTENVm : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">; + +def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">; +def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; + +def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">; +def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; + +def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; +def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">; + +def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; +def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; + +def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; +def FBSTPm : FPI<0xDF, MRM6m, (outs f80mem:$dst), (ins), "fbstp\t$dst">; + +// Floating point cmovs. +class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>; +class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>; + +multiclass FPCMov<PatLeaf cc> { + def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), + CondMovFP, + [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, + cc, EFLAGS))]>; + def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), + CondMovFP, + [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2, + cc, EFLAGS))]>; + def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), + CondMovFP, + [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, + cc, EFLAGS))]>, + Requires<[HasCMov]>; +} + +let Defs = [FPSW] in { +let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { +defm CMOVB : FPCMov<X86_COND_B>; +defm CMOVBE : FPCMov<X86_COND_BE>; +defm CMOVE : FPCMov<X86_COND_E>; +defm CMOVP : FPCMov<X86_COND_P>; +defm CMOVNB : FPCMov<X86_COND_AE>; +defm CMOVNBE: FPCMov<X86_COND_A>; +defm CMOVNE : FPCMov<X86_COND_NE>; +defm CMOVNP : FPCMov<X86_COND_NP>; +} // Uses = [EFLAGS], Constraints = "$src1 = $dst" + +let Predicates = [HasCMov] in { +// These are not factored because there's no clean way to pass DA/DB. +def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op), + "fcmovb\t{$op, %st(0)|st(0), $op}">; +def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op), + "fcmovbe\t{$op, %st(0)|st(0), $op}">; +def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op), + "fcmove\t{$op, %st(0)|st(0), $op}">; +def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op), + "fcmovu\t{$op, %st(0)|st(0), $op}">; +def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op), + "fcmovnb\t{$op, %st(0)|st(0), $op}">; +def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op), + "fcmovnbe\t{$op, %st(0)|st(0), $op}">; +def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op), + "fcmovne\t{$op, %st(0)|st(0), $op}">; +def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op), + "fcmovnu\t{$op, %st(0)|st(0), $op}">; +} // Predicates = [HasCMov] + +// Floating point loads & stores. 
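A small illustration for the load patterns that follow (invented function name): x87 loads always widen to the internal 80-bit format, so a float load feeding double arithmetic can be selected as a single FLD of the 32-bit memory operand via the extloadf32 patterns, with no separate conversion step.

    double widen(const float *p) { return *p; }  // expected to become a single "flds" under x87-only codegen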
+let canFoldAsLoad = 1 in { +def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (loadf32 addr:$src))]>; +let isReMaterializable = 1 in + def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (loadf64 addr:$src))]>; +def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, + [(set RFP80:$dst, (loadf80 addr:$src))]>; +} +def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>; +def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>; +def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>; +def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i64))]>; + +def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, + [(store RFP32:$src, addr:$op)]>; +def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, + [(truncstoref32 RFP64:$src, addr:$op)]>; +def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, + [(store RFP64:$src, addr:$op)]>; +def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, + [(truncstoref32 RFP80:$src, addr:$op)]>; +def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, + [(truncstoref64 RFP80:$src, addr:$op)]>; +// FST does not support 80-bit memory target; FSTP must be used. 
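+// For illustration: the x87 ISA has no non-popping 80-bit store; only the
+// popping form exists (ST_FP80m below, "fstp{t}", encoded DB /7), so an
+// 80-bit spill looks like
+//   fstpt (%esp)     # store ST(0) as 80-bit extended precision and pop
+// and only the popping pseudo ST_FpP80m carries an f80 store pattern.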
+ +let mayStore = 1, hasSideEffects = 0 in { +def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>; +def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>; +def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>; +} +def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, + [(store RFP80:$src, addr:$op)]>; +let mayStore = 1, hasSideEffects = 0 in { +def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; +} + +let mayLoad = 1, SchedRW = [WriteLoad] in { +def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", + IIC_FLD>; +def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src", + IIC_FLD>; +def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src", + IIC_FLD80>; +def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src", + IIC_FILD>; +def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src", + IIC_FILD>; +def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src", + IIC_FILD>; +} +let mayStore = 1, SchedRW = [WriteStore] in { +def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst", + IIC_FST>; +def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst", + IIC_FST>; +def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst", + IIC_FST>; +def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst", + IIC_FST>; +def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst", + IIC_FST80>; +def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst", + IIC_FIST>; +def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst", + IIC_FIST>; +def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst", + IIC_FIST>; +def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst", + IIC_FIST>; +def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst", + IIC_FIST>; +} + +// FISTTP requires SSE3 even though it's a FPStack op. 
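+// For illustration: unlike FIST/FISTP, FISTTP always converts with truncation
+// (round toward zero) regardless of the rounding control in the FP control
+// word, so a C-style cast of ST(0) to a 64-bit integer needs just
+//   fisttpll (%rsp)  # truncate ST(0) to i64, store, and pop
+// which is what the X86fp_to_i*mem patterns below select when SSE3 is present.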
+let Predicates = [HasSSE3] in { +def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i16mem RFP32:$src, addr:$op)]>; +def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i32mem RFP32:$src, addr:$op)]>; +def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i64mem RFP32:$src, addr:$op)]>; +def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i16mem RFP64:$src, addr:$op)]>; +def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i32mem RFP64:$src, addr:$op)]>; +def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i64mem RFP64:$src, addr:$op)]>; +def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i16mem RFP80:$src, addr:$op)]>; +def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i32mem RFP80:$src, addr:$op)]>; +def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i64mem RFP80:$src, addr:$op)]>; +} // Predicates = [HasSSE3] + +let mayStore = 1, SchedRW = [WriteStore] in { +def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst", + IIC_FST>; +def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst", + IIC_FST>; +def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), + "fisttp{ll}\t$dst", IIC_FST>; +} + +// FP Stack manipulation instructions. +let SchedRW = [WriteMove] in { +def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>; +def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>; +def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>; +def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>; +} + +// Floating point constant loads. +let isReMaterializable = 1 in { +def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm0)]>; +def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm1)]>; +def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm0)]>; +def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm1)]>; +def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm0)]>; +def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm1)]>; +} + +let SchedRW = [WriteZero] in { +def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>; +def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>; +} + +// Floating point compares. 
+let SchedRW = [WriteFAdd] in { +def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, + [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>; +def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, + [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>; +def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, + [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>; +} // SchedRW +} // Defs = [FPSW] + +let SchedRW = [WriteFAdd] in { +// CC = ST(0) cmp ST(i) +let Defs = [EFLAGS, FPSW] in { +def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>; +def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>; +def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>; +} + +let Defs = [FPSW], Uses = [ST0] in { +def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i) + (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>; +def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop + (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>; +def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop + (outs), (ins), "fucompp", IIC_FUCOM>; +} + +let Defs = [EFLAGS, FPSW], Uses = [ST0] in { +def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i) + (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>; +def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop + (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>; +} + +let Defs = [EFLAGS, FPSW] in { +def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg), + "fcomi\t$reg", IIC_FCOMI>; +def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg), + "fcompi\t$reg", IIC_FCOMI>; +} +} // SchedRW + +// Floating point flag ops. +let SchedRW = [WriteALU] in { +let Defs = [AX], Uses = [FPSW] in +def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags + (outs), (ins), "fnstsw\t{%ax|ax}", + [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>; + +def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world + (outs), (ins i16mem:$dst), "fnstcw\t$dst", + [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>; +} // SchedRW +let mayLoad = 1 in +def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] + (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>, + Sched<[WriteLoad]>; + +// FPU control instructions +let SchedRW = [WriteMicrocoded] in { +let Defs = [FPSW] in +def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>; +def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), + "ffree\t$reg", IIC_FFREE>; +// Clear exceptions + +let Defs = [FPSW] in +def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>; +} // SchedRW + +// Operandless floating-point instructions for the disassembler. 
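+// A worked note on the compare forms above: without FCOMI/FUCOMI the FPU
+// condition codes have to be routed through AX and EFLAGS by hand, e.g.
+//   fucompp          # compare ST(0) with ST(1), pop twice (UCOM_FPPr)
+//   fnstsw %ax       # AX = FPSW (FNSTSW16r)
+//   sahf             # copy AH into EFLAGS, then branch with jae/jne/...
+// whereas UCOM_FIr/COM_FIr ("fucomi"/"fcomi") set EFLAGS directly.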
+let SchedRW = [WriteMicrocoded] in { +def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>; + +def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>; +def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>; +def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>; +def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>; +def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>; +def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>; +def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>; +def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>; +def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>; +def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>; +def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>; +def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>; +def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>; +def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>; +def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>; +def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>; +def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>; +def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>; +def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>; +def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>; +def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; + +let Predicates = [HasFXSR] in { + def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; + def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], + IIC_FXSAVE>, TB, Requires<[In64BitMode]>; + def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; + def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], + IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +} // Predicates = [FeatureFXSR] +} // SchedRW + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// Required for RET of f32 / f64 / f80 values. +def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>; +def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>; +def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>; + +// Required for CALL which return f32 / f64 / f80 values. 
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>; +def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, + RFP64:$src)>; +def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>; +def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, + RFP80:$src)>; +def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, + RFP80:$src)>; +def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, + RFP80:$src)>; + +// Floating point constant -0.0 and -1.0 +def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>; +def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>; +def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>; +def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>; +def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>; +def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>; + +// Used to conv. i64 to f64 since there isn't a SSE version. +def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>; + +// FP extensions map onto simple pseudo-value conversions if they are to/from +// the FP stack. +def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, + Requires<[FPStackf32]>; +def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, + Requires<[FPStackf32]>; +def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, + Requires<[FPStackf64]>; + +// FP truncations map onto simple pseudo-value conversions if they are to/from +// the FP stack. We have validated that only value-preserving truncations make +// it through isel. +def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, + Requires<[FPStackf32]>; +def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, + Requires<[FPStackf32]>; +def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, + Requires<[FPStackf64]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td new file mode 100644 index 0000000..e2fa295 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td @@ -0,0 +1,948 @@ +//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 Instruction Format Definitions. +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. 
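+// For illustration: the MRM0r-MRM7r and MRM0m-MRM7m forms name the /0../7
+// opcode extension carried in the reg field of the ModRM byte, with a register
+// or a memory r/m operand respectively. E.g. "fld m32fp" is encoded as D9 /0,
+// which is why the FP-stack file above has
+//   def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", IIC_FLD>;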
+class Format<bits<7> val> { + bits<7> Value = val; +} + +def Pseudo : Format<0>; def RawFrm : Format<1>; +def AddRegFrm : Format<2>; def MRMDestReg : Format<3>; +def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>; +def MRMSrcMem : Format<6>; def RawFrmMemOffs : Format<7>; +def RawFrmSrc : Format<8>; def RawFrmDst : Format<9>; +def RawFrmDstSrc: Format<10>; +def RawFrmImm8 : Format<11>; +def RawFrmImm16 : Format<12>; +def MRMXr : Format<14>; def MRMXm : Format<15>; +def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>; +def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>; +def MRM6r : Format<22>; def MRM7r : Format<23>; +def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>; +def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>; +def MRM6m : Format<30>; def MRM7m : Format<31>; +def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>; +def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C5 : Format<37>; +def MRM_C6 : Format<38>; def MRM_C7 : Format<39>; def MRM_C8 : Format<40>; +def MRM_C9 : Format<41>; def MRM_CA : Format<42>; def MRM_CB : Format<43>; +def MRM_CC : Format<44>; def MRM_CD : Format<45>; def MRM_CE : Format<46>; +def MRM_CF : Format<47>; def MRM_D0 : Format<48>; def MRM_D1 : Format<49>; +def MRM_D2 : Format<50>; def MRM_D3 : Format<51>; def MRM_D4 : Format<52>; +def MRM_D5 : Format<53>; def MRM_D6 : Format<54>; def MRM_D7 : Format<55>; +def MRM_D8 : Format<56>; def MRM_D9 : Format<57>; def MRM_DA : Format<58>; +def MRM_DB : Format<59>; def MRM_DC : Format<60>; def MRM_DD : Format<61>; +def MRM_DE : Format<62>; def MRM_DF : Format<63>; def MRM_E0 : Format<64>; +def MRM_E1 : Format<65>; def MRM_E2 : Format<66>; def MRM_E3 : Format<67>; +def MRM_E4 : Format<68>; def MRM_E5 : Format<69>; def MRM_E6 : Format<70>; +def MRM_E7 : Format<71>; def MRM_E8 : Format<72>; def MRM_E9 : Format<73>; +def MRM_EA : Format<74>; def MRM_EB : Format<75>; def MRM_EC : Format<76>; +def MRM_ED : Format<77>; def MRM_EE : Format<78>; def MRM_EF : Format<79>; +def MRM_F0 : Format<80>; def MRM_F1 : Format<81>; def MRM_F2 : Format<82>; +def MRM_F3 : Format<83>; def MRM_F4 : Format<84>; def MRM_F5 : Format<85>; +def MRM_F6 : Format<86>; def MRM_F7 : Format<87>; def MRM_F8 : Format<88>; +def MRM_F9 : Format<89>; def MRM_FA : Format<90>; def MRM_FB : Format<91>; +def MRM_FC : Format<92>; def MRM_FD : Format<93>; def MRM_FE : Format<94>; +def MRM_FF : Format<95>; + +// ImmType - This specifies the immediate type used by an instruction. This is +// part of the ad-hoc solution used to emit machine instruction encodings by our +// machine code emitter. +class ImmType<bits<4> val> { + bits<4> Value = val; +} +def NoImm : ImmType<0>; +def Imm8 : ImmType<1>; +def Imm8PCRel : ImmType<2>; +def Imm16 : ImmType<3>; +def Imm16PCRel : ImmType<4>; +def Imm32 : ImmType<5>; +def Imm32PCRel : ImmType<6>; +def Imm32S : ImmType<7>; +def Imm64 : ImmType<8>; + +// FPFormat - This specifies what form this FP instruction has. This is used by +// the Floating-Point stackifier pass. +class FPFormat<bits<3> val> { + bits<3> Value = val; +} +def NotFP : FPFormat<0>; +def ZeroArgFP : FPFormat<1>; +def OneArgFP : FPFormat<2>; +def OneArgFPRW : FPFormat<3>; +def TwoArgFP : FPFormat<4>; +def CompareFP : FPFormat<5>; +def CondMovFP : FPFormat<6>; +def SpecialFP : FPFormat<7>; + +// Class specifying the SSE execution domain, used by the SSEDomainFix pass. +// Keep in sync with tables in X86InstrInfo.cpp. 
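+// For illustration: the domain chosen here is stored in ExeDomain and packed
+// into TSFlags{28-27} in X86Inst below, which is what lets the domain-fix pass
+// swap an instruction for an equivalent one in another domain when that avoids
+// a domain-crossing penalty. Conceptually, a register-to-register copy has
+// equivalent encodings in several domains:
+//   movaps %xmm0, %xmm1   # SSEPackedSingle
+//   movdqa %xmm0, %xmm1   # SSEPackedInt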
+class Domain<bits<2> val> { + bits<2> Value = val; +} +def GenericDomain : Domain<0>; +def SSEPackedSingle : Domain<1>; +def SSEPackedDouble : Domain<2>; +def SSEPackedInt : Domain<3>; + +// Class specifying the vector form of the decompressed +// displacement of 8-bit. +class CD8VForm<bits<3> val> { + bits<3> Value = val; +} +def CD8VF : CD8VForm<0>; // v := VL +def CD8VH : CD8VForm<1>; // v := VL/2 +def CD8VQ : CD8VForm<2>; // v := VL/4 +def CD8VO : CD8VForm<3>; // v := VL/8 +// The tuple (subvector) forms. +def CD8VT1 : CD8VForm<4>; // v := 1 +def CD8VT2 : CD8VForm<5>; // v := 2 +def CD8VT4 : CD8VForm<6>; // v := 4 +def CD8VT8 : CD8VForm<7>; // v := 8 + +// Class specifying the prefix used an opcode extension. +class Prefix<bits<3> val> { + bits<3> Value = val; +} +def NoPrfx : Prefix<0>; +def PS : Prefix<1>; +def PD : Prefix<2>; +def XS : Prefix<3>; +def XD : Prefix<4>; + +// Class specifying the opcode map. +class Map<bits<3> val> { + bits<3> Value = val; +} +def OB : Map<0>; +def TB : Map<1>; +def T8 : Map<2>; +def TA : Map<3>; +def XOP8 : Map<4>; +def XOP9 : Map<5>; +def XOPA : Map<6>; + +// Class specifying the encoding +class Encoding<bits<2> val> { + bits<2> Value = val; +} +def EncNormal : Encoding<0>; +def EncVEX : Encoding<1>; +def EncXOP : Encoding<2>; +def EncEVEX : Encoding<3>; + +// Operand size for encodings that change based on mode. +class OperandSize<bits<2> val> { + bits<2> Value = val; +} +def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix. +def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode. +def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode. + +// Address size for encodings that change based on mode. +class AddressSize<bits<2> val> { + bits<2> Value = val; +} +def AdSizeX : AddressSize<0>; // Address size determined using addr operand. +def AdSize16 : AddressSize<1>; // Encodes a 16-bit address. +def AdSize32 : AddressSize<2>; // Encodes a 32-bit address. +def AdSize64 : AddressSize<3>; // Encodes a 64-bit address. + +// Prefix byte classes which are used to indicate to the ad-hoc machine code +// emitter that various prefix bytes are required. 
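+// For illustration, these mixins are appended to instruction defs to fill in
+// the prefix-related fields of X86Inst below; a hypothetical
+//   def FOOrr : I<0xNN, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "foo\t...", []>, T8PD, REX_W;
+// (placeholder opcode and mnemonic) ends up with OpMap = T8, OpPrefix = PD and
+// hasREX_WPrefix = 1.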
+class OpSize16 { OperandSize OpSize = OpSize16; } +class OpSize32 { OperandSize OpSize = OpSize32; } +class AdSize16 { AddressSize AdSize = AdSize16; } +class AdSize32 { AddressSize AdSize = AdSize32; } +class AdSize64 { AddressSize AdSize = AdSize64; } +class REX_W { bit hasREX_WPrefix = 1; } +class LOCK { bit hasLockPrefix = 1; } +class REP { bit hasREPPrefix = 1; } +class TB { Map OpMap = TB; } +class T8 { Map OpMap = T8; } +class TA { Map OpMap = TA; } +class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; } +class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; } +class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; } +class OBXS { Prefix OpPrefix = XS; } +class PS : TB { Prefix OpPrefix = PS; } +class PD : TB { Prefix OpPrefix = PD; } +class XD : TB { Prefix OpPrefix = XD; } +class XS : TB { Prefix OpPrefix = XS; } +class T8PS : T8 { Prefix OpPrefix = PS; } +class T8PD : T8 { Prefix OpPrefix = PD; } +class T8XD : T8 { Prefix OpPrefix = XD; } +class T8XS : T8 { Prefix OpPrefix = XS; } +class TAPS : TA { Prefix OpPrefix = PS; } +class TAPD : TA { Prefix OpPrefix = PD; } +class TAXD : TA { Prefix OpPrefix = XD; } +class VEX { Encoding OpEnc = EncVEX; } +class VEX_W { bit hasVEX_WPrefix = 1; } +class VEX_4V : VEX { bit hasVEX_4V = 1; } +class VEX_4VOp3 : VEX { bit hasVEX_4VOp3 = 1; } +class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } +class VEX_L { bit hasVEX_L = 1; } +class VEX_LIG { bit ignoresVEX_L = 1; } +class EVEX : VEX { Encoding OpEnc = EncEVEX; } +class EVEX_4V : VEX_4V { Encoding OpEnc = EncEVEX; } +class EVEX_K { bit hasEVEX_K = 1; } +class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; } +class EVEX_B { bit hasEVEX_B = 1; } +class EVEX_RC { bit hasEVEX_RC = 1; } +class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; } +class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; } +class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; } + +// Specify AVX512 8-bit compressed displacement encoding based on the vector +// element size in bits (8, 16, 32, 64) and the CDisp8 form. +class EVEX_CD8<int esize, CD8VForm form> { + int CD8_EltSize = !srl(esize, 3); + bits<3> CD8_Form = form.Value; +} + +class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } +class MemOp4 { bit hasMemOp4Prefix = 1; } +class XOP { Encoding OpEnc = EncXOP; } +class XOP_4V : XOP { bit hasVEX_4V = 1; } +class XOP_4VOp3 : XOP { bit hasVEX_4VOp3 = 1; } + +class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, + string AsmStr, + InstrItinClass itin, + Domain d = GenericDomain> + : Instruction { + let Namespace = "X86"; + + bits<8> Opcode = opcod; + Format Form = f; + bits<7> FormBits = Form.Value; + ImmType ImmT = i; + + dag OutOperandList = outs; + dag InOperandList = ins; + string AsmString = AsmStr; + + // If this is a pseudo instruction, mark it isCodeGenOnly. + let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); + + let Itinerary = itin; + + // + // Attributes specific to X86 instructions... + // + bit ForceDisassemble = 0; // Force instruction to disassemble even though it's + // isCodeGenonly. Needed to hide an ambiguous + // AsmString from the parser, but still disassemble. + + OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change + // based on operand size of the mode? + bits<2> OpSizeBits = OpSize.Value; + AddressSize AdSize = AdSizeX; // Does this instruction's encoding change + // based on address size of the mode? + bits<2> AdSizeBits = AdSize.Value; + + Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have? 
+ bits<3> OpPrefixBits = OpPrefix.Value; + Map OpMap = OB; // Which opcode map does this inst have? + bits<3> OpMapBits = OpMap.Value; + bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? + FPFormat FPForm = NotFP; // What flavor of FP instruction is this? + bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? + Domain ExeDomain = d; + bit hasREPPrefix = 0; // Does this inst have a REP prefix? + Encoding OpEnc = EncNormal; // Encoding used by this instruction + bits<2> OpEncBits = OpEnc.Value; + bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field? + bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field? + bit hasVEX_4VOp3 = 0; // Does this inst require the VEX.VVVV field to + // encode the third operand? + bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register + // to be encoded in a immediate field? + bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? + bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit + bit hasEVEX_K = 0; // Does this inst require masking? + bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field? + bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field? + bit hasEVEX_B = 0; // Does this inst set the EVEX_B field? + bits<3> CD8_Form = 0; // Compressed disp8 form - vector-width. + // Declare it int rather than bits<4> so that all bits are defined when + // assigning to bits<7>. + int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes. + bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? + bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands + bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction. + + bits<2> EVEX_LL; + let EVEX_LL{0} = hasVEX_L; + let EVEX_LL{1} = hasEVEX_L2; + // Vector size in bytes. + bits<7> VectSize = !shl(16, EVEX_LL); + + // The scaling factor for AVX512's compressed displacement is either + // - the size of a power-of-two number of elements or + // - the size of a single element for broadcasts or + // - the total vector size divided by a power-of-two number. + // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64. + bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value), + !if (CD8_Form{2}, + !shl(CD8_EltSize, CD8_Form{1-0}), + !if (hasEVEX_B, + CD8_EltSize, + !srl(VectSize, CD8_Form{1-0}))), 0); + + // TSFlags layout should be kept in sync with X86BaseInfo.h. + let TSFlags{6-0} = FormBits; + let TSFlags{8-7} = OpSizeBits; + let TSFlags{10-9} = AdSizeBits; + let TSFlags{13-11} = OpPrefixBits; + let TSFlags{16-14} = OpMapBits; + let TSFlags{17} = hasREX_WPrefix; + let TSFlags{21-18} = ImmT.Value; + let TSFlags{24-22} = FPForm.Value; + let TSFlags{25} = hasLockPrefix; + let TSFlags{26} = hasREPPrefix; + let TSFlags{28-27} = ExeDomain.Value; + let TSFlags{30-29} = OpEncBits; + let TSFlags{38-31} = Opcode; + let TSFlags{39} = hasVEX_WPrefix; + let TSFlags{40} = hasVEX_4V; + let TSFlags{41} = hasVEX_4VOp3; + let TSFlags{42} = hasVEX_i8ImmReg; + let TSFlags{43} = hasVEX_L; + let TSFlags{44} = ignoresVEX_L; + let TSFlags{45} = hasEVEX_K; + let TSFlags{46} = hasEVEX_Z; + let TSFlags{47} = hasEVEX_L2; + let TSFlags{48} = hasEVEX_B; + // If we run out of TSFlags bits, it's possible to encode this in 3 bits. 
+ let TSFlags{55-49} = CD8_Scale; + let TSFlags{56} = has3DNow0F0FOpcode; + let TSFlags{57} = hasMemOp4Prefix; + let TSFlags{58} = hasEVEX_RC; +} + +class PseudoI<dag oops, dag iops, list<dag> pattern> + : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> { + let Pattern = pattern; +} + +class I<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm16, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32S, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +// FPStack Instruction Templates: +// FPI - Floating Point Instruction template. +class FPI<bits<8> o, Format F, dag outs, dag ins, string asm, + InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, [], itin> {} + +// FpI_ - Floating Point Pseudo Instruction template. Not Predicated. 
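+// For illustration: FpI_-derived pseudos (the _Fp32/_Fp64/_Fp80 defs in the
+// FP-stack file) operate on the RFP32/RFP64/RFP80 register classes and carry
+// the selection patterns; their FPForm value is what the FP stackifier uses
+// when it later rewrites them onto the physical ST(i) stack as real FPI
+// instructions. A hypothetical one-operand pseudo would be
+//   def MYOP_Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW, []>;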
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, + InstrItinClass itin = NoItinerary> + : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> { + let FPForm = fp; + let Pattern = pattern; +} + +// Templates for instructions that use a 16- or 32-bit segmented address as +// their only operand: lcall (FAR CALL) and ljmp (FAR JMP) +// +// Iseg16 - 16-bit segment selector, 16-bit offset +// Iseg32 - 16-bit segment selector, 32-bit offset + +class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm16, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +// SI - SSE 1 & 2 scalar instructions +class SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} + +// SI - SSE 1 & 2 scalar intrinsics - vex form available on AVX512 +class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} +// SIi8 - SSE 1 & 2 scalar instructions - vex form available on AVX512 +class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + [UseSSE2]))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} + +// PI - SSE 1 & 2 packed instructions +class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin, Domain d> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), 
!strconcat("v", asm), + asm)); +} + +// MMXPI - SSE 1 & 2 packed instructions with MMX operands +class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin, Domain d> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasSSE2], + [HasSSE1]); +} + +// PIi8 - SSE 1 & 2 packed instructions with immediate +class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin, Domain d> + : Ii8<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} + +// SSE1 Instruction Templates: +// +// SSI - SSE1 instructions with XS prefix. +// PSI - SSE1 instructions with PS prefix. +// PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix. +// VSSI - SSE1 instructions with XS prefix in AVX form. +// VPSI - SSE1 instructions with PS prefix in AVX form, packed single. + +class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; +class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; +class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS, + Requires<[UseSSE1]>; +class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS, + Requires<[UseSSE1]>; +class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, + Requires<[HasAVX]>; +class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, PS, + Requires<[HasAVX]>; + +// SSE2 Instruction Templates: +// +// SDI - SSE2 instructions with XD prefix. +// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix. +// S2SI - SSE2 instructions with XS prefix. +// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix. +// PDI - SSE2 instructions with PD prefix, packed double domain. +// PDIi8 - SSE2 instructions with ImmT == Imm8 and PD prefix. +// VSDI - SSE2 scalar instructions with XD prefix in AVX form. +// VPDI - SSE2 vector instructions with PD prefix in AVX form, +// packed double domain. +// VS2I - SSE2 scalar instructions with PD prefix in AVX form. +// S2I - SSE2 scalar instructions with PD prefix. +// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as +// MMX operands. +// MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as +// MMX operands. 
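+// For illustration (a sketch, not how X86InstrSSE.td actually spells such
+// defs): real SSE2 scalar ops such as addsd are declared through SDI, which
+// supplies the XD (0xF2) prefix and the UseSSE2 predicate; a hypothetical
+//   def FOOSDrr : SDI<0xNN, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+//                     "foosd\t{$src2, $dst|$dst, $src2}", []>;
+// (placeholder opcode and mnemonic) would pick both up automatically.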
+ +class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; +class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; +class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>; +class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>; +class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD, + Requires<[UseSSE2]>; +class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD, + Requires<[UseSSE2]>; +class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD, + Requires<[UseAVX]>; +class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, + Requires<[HasAVX]>; +class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, + PD, Requires<[HasAVX]>; +class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, PD, + Requires<[UseAVX]>; +class S2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[UseSSE2]>; +class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; +class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; + +// SSE3 Instruction Templates: +// +// S3I - SSE3 instructions with PD prefixes. +// S3SI - SSE3 instructions with XS prefix. +// S3DI - SSE3 instructions with XD prefix. + +class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS, + Requires<[UseSSE3]>; +class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD, + Requires<[UseSSE3]>; +class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD, + Requires<[UseSSE3]>; + + +// SSSE3 Instruction Templates: +// +// SS38I - SSSE3 instructions with T8 prefix. +// SS3AI - SSSE3 instructions with TA prefix. 
+// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands. +// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands. +// +// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version +// uses the MMX registers. The 64-bit versions are grouped with the MMX +// classes. They need to be enabled even if AVX is enabled. + +class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[UseSSSE3]>; +class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[UseSSSE3]>; +class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PS, + Requires<[HasSSSE3]>; +class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPS, + Requires<[HasSSSE3]>; + +// SSE4.1 Instruction Templates: +// +// SS48I - SSE 4.1 instructions with T8 prefix. +// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8. +// +class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[UseSSE41]>; +class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[UseSSE41]>; + +// SSE4.2 Instruction Templates: +// +// SS428I - SSE 4.2 instructions with T8 prefix. +class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[UseSSE42]>; + +// SS42FI - SSE 4.2 instructions with T8XD prefix. +// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns. +class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>; + +// SS42AI = SSE 4.2 instructions with TA prefix +class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[UseSSE42]>; + +// AVX Instruction Templates: +// Instructions introduced in AVX (no SSE equivalent forms) +// +// AVX8I - AVX instructions with T8PD prefix. +// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8. +class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[HasAVX]>; +class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasAVX]>; + +// AVX2 Instruction Templates: +// Instructions introduced in AVX2 (no SSE equivalent forms) +// +// AVX28I - AVX2 instructions with T8PD prefix. +// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8. 
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[HasAVX2]>; +class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasAVX2]>; + + +// AVX-512 Instruction Templates: +// Instructions introduced in AVX-512 (no SSE equivalent forms) +// +// AVX5128I - AVX-512 instructions with T8PD prefix. +// AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8. +// AVX512PDI - AVX-512 instructions with PD, double packed. +// AVX512PSI - AVX-512 instructions with PS, single packed. +// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes. +// AVX512XSI - AVX-512 instructions with XS prefix, generic domain. +// AVX512BI - AVX-512 instructions with PD, int packed domain. +// AVX512SI - AVX-512 scalar instructions with PD prefix. + +class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[HasAVX512]>; +class AVX5128IBase : T8PD { + Domain ExeDomain = SSEPackedInt; +} +class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS, + Requires<[HasAVX512]>; +class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XS, + Requires<[HasAVX512]>; +class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD, + Requires<[HasAVX512]>; +class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD, + Requires<[HasAVX512]>; +class AVX512BIBase : PD { + Domain ExeDomain = SSEPackedInt; +} +class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD, + Requires<[HasAVX512]>; +class AVX512BIi8Base : PD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512XSIi8Base : XS { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512XDIi8Base : XD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512PSIi8Base : PS { + Domain ExeDomain = SSEPackedSingle; + ImmType ImmT = Imm8; +} +class AVX512PDIi8Base : PD { + Domain ExeDomain = SSEPackedDouble; + ImmType ImmT = Imm8; +} +class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasAVX512]>; +class AVX512AIi8Base : TAPD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, + Requires<[HasAVX512]>; +class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, 
ins, asm, pattern, itin, SSEPackedDouble>, PD, + Requires<[HasAVX512]>; +class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS, + Requires<[HasAVX512]>; +class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>; +class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>; +class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8PD, + EVEX_4V, Requires<[HasAVX512]>; +class AVX512FMA3Base : T8PD, EVEX_4V; + +class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>; + +// AES Instruction Templates: +// +// AES8I +// These use the same encoding as the SSE4.2 T8 and TA encodings. +class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = IIC_AES> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[HasAES]>; + +class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasAES]>; + +// PCLMUL Instruction Templates +class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasPCLMUL]>; + +class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + VEX_4V, Requires<[HasAVX, HasPCLMUL]>; + +// FMA3 Instruction Templates +class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8PD, + VEX_4V, FMASC, Requires<[HasFMA]>; + +// FMA4 Instruction Templates +class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, TAPD, + VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>; + +// XOP 2, 3 and 4 Operand Instruction Template +class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, + XOP9, Requires<[HasXOP]>; + +// XOP 2, 3 and 4 Operand Instruction Templates with imm byte +class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, + XOP8, Requires<[HasXOP]>; + +// XOP 5 operand instruction (VEX encoding!) +class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + VEX_4V, VEX_I8IMM, Requires<[HasXOP]>; + +// X86-64 Instruction templates... 
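+// For illustration: the R* templates below are simply their base templates
+// with the REX_W mixin, i.e. they force the REX.W prefix that selects 64-bit
+// operand size. A hypothetical 64-bit register-register op would be
+//   def FOO64rr : RI<0xNN, MRMDestReg, (outs GR64:$dst),
+//                    (ins GR64:$src1, GR64:$src2), "foo{q}\t...", []>;
+// (placeholder opcode and mnemonic).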
+// + +class RI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, REX_W; +class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W; +class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii16<o, F, outs, ins, asm, pattern, itin>, REX_W; +class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W; +class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii32S<o, F, outs, ins, asm, pattern, itin>, REX_W; + +class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W { + let Pattern = pattern; + let CodeSize = 3; +} + +class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm64, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W; +class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W; + +// MMX Instruction templates +// + +// MMXI - MMX instructions with TB prefix. +// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode. +// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode. +// MMX2I - MMX / SSE2 instructions with PD prefix. +// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. +// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. +// MMXID - MMX instructions with XD prefix. +// MMXIS - MMX instructions with XS prefix. 
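+// For illustration: MMXRI below is the PS-prefixed MMX template plus REX_W,
+// i.e. it is intended for forms that need a 64-bit operand size, such as the
+// moves between %mm registers and 64-bit GPRs (e.g. "movq %mm0, %rax").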
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>; +class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>; +class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>; +class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>; +class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>; +class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>; +class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>; +class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td new file mode 100644 index 0000000..829cedd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -0,0 +1,1037 @@ +//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides pattern fragments useful for SIMD instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX specific DAG Nodes. +//===----------------------------------------------------------------------===// + +// Low word of MMX to GPR. +def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, + [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>; +// GPR to low word of MMX. +def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>; + +//===----------------------------------------------------------------------===// +// MMX Pattern Fragments +//===----------------------------------------------------------------------===// + +def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>; +def load_mvmmx : PatFrag<(ops node:$ptr), + (x86mmx (MMX_X86movw2d (load node:$ptr)))>; +def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; + +//===----------------------------------------------------------------------===// +// SSE specific DAG Nodes. 
+//===----------------------------------------------------------------------===// + +def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, + SDTCisFP<1>, SDTCisVT<3, i8>, + SDTCisVec<1>]>; +def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisInt<3>]>; + +def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; +def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; + +// Commutative and Associative FMIN and FMAX. +def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; + +def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; +def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86frsqrt14s: SDNode<"X86ISD::FRSQRT", SDTFPBinOp>; +def X86frcp14s : SDNode<"X86ISD::FRCP", SDTFPBinOp>; +def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; +def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; +def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; +def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; +def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; +def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>; +def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>; +def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; +//def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; +def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", + SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v4i32>]>>; +def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD", + SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v4i32>]>>; +def X86pshufb : SDNode<"X86ISD::PSHUFB", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psadbw : SDNode<"X86ISD::PSADBW", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; +def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>, SDTCisInt<3>]>>; +def X86andnp : SDNode<"X86ISD::ANDNP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psign : SDNode<"X86ISD::PSIGN", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86pextrb : SDNode<"X86ISD::PEXTRB", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>, + SDTCisPtrTy<2>]>>; +def X86pextrw : SDNode<"X86ISD::PEXTRW", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>, + SDTCisPtrTy<2>]>>; +def X86pinsrb : SDNode<"X86ISD::PINSRB", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86pinsrw : SDNode<"X86ISD::PINSRW", + SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86insertps : SDNode<"X86ISD::INSERTPS", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, + SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>; +def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", + SDTypeProfile<1, 1, 
[SDTCisSameAs<0,1>]>>; + +def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def X86vzext : SDNode<"X86ISD::VZEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def X86vsext : SDNode<"X86ISD::VSEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>; + +def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>; +def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>; +def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>; + +def X86trunc : SDNode<"X86ISD::TRUNC", + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>>; +def X86vfpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCisOpSmallerThanOp<1, 0>]>>; +def X86vfpround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCisOpSmallerThanOp<0, 1>]>>; + +def X86fround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>]>>; +def X86froundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisInt<3>]>>; + +def X86fpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def X86fpextRnd : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisInt<3>]>>; + +def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; +def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; +def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; +def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>; + +def X86IntCmpMask : SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>; +def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>; +def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>; + +def X86CmpMaskCC : + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; +def X86CmpMaskCCRound : + SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, + SDTCisInt<4>]>; +def X86CmpMaskCCScalar : + SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + +def X86CmpMaskCCScalarRound : + SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, + SDTCisInt<4>]>; + +def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; +def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>; + +def X86vshl : SDNode<"X86ISD::VSHL", + 
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; +def X86vsrl : SDNode<"X86ISD::VSRL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; +def X86vsra : SDNode<"X86ISD::VSRA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; + +def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; +def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; +def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; + +def X86vprot : SDNode<"X86ISD::VPROT", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vproti : SDNode<"X86ISD::VPROTI", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>]>>; + +def X86vpshl : SDNode<"X86ISD::VPSHL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vpsha : SDNode<"X86ISD::VPSHA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; + +def X86vpcom : SDNode<"X86ISD::VPCOM", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; +def X86vpcomu : SDNode<"X86ISD::VPCOMU", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; + +def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVec<1>, + SDTCisSameAs<2, 1>]>; +def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>; +def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; +def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>; +def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>; +def X86mulhrs : SDNode<"X86ISD::MULHRS" , SDTIntBinOp>; +def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>; +def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; +def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; +def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; +def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCVecEltisVT<0, i1>, + SDTCisSameNumEltsAs<0, 1>]>>; +def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCVecEltisVT<0, i1>, + SDTCisSameNumEltsAs<0, 1>]>>; +def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; + +def X86pmuludq : SDNode<"X86ISD::PMULUDQ", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; +def X86pmuldq : SDNode<"X86ISD::PMULDQ", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; + +def X86extrqi : SDNode<"X86ISD::EXTRQI", + SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>; +def X86insertqi : SDNode<"X86ISD::INSERTQI", + SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i8>, + SDTCisVT<4, i8>]>>; + +// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get +// translated into one of the target nodes below during lowering. +// Note: this is a work in progress... 
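To make the comment above concrete: once lowering has replaced a generic ISD::VECTOR_SHUFFLE with one of the target nodes declared below, instruction selection matches that node directly in a pattern. The pattern here is a simplified sketch under that assumption; EXAMPLE_PSHUFDri is a placeholder instruction name, not a definition from this import.

// Hypothetical selection pattern for the X86PShufd node declared below;
// the destination instruction name is a placeholder.
def : Pat<(v4i32 (X86PShufd VR128:$src, (i8 imm:$imm))),
          (EXAMPLE_PSHUFDri VR128:$src, imm:$imm)>;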
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; + +def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameSizeAs<0,2>, + SDTCisSameNumEltsAs<0,2>]>; +def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>; +def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>; +def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>; +def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; + +def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, SDTCisInt<1>]>; + +def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; + +def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0,3>, + SDTCisVT<4, i8>]>; + +def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc. + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>; + +def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc. + SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]>; + +def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>; +def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisVT<2, i32>]>; +def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisVT<3, i32>]>; +def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; + +def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; +def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; + +def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; +def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; + +def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; +def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; +def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>; + +def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>; +def X86Shuf128 : SDNode<"X86ISD::SHUF128", SDTShuff3OpI>; + +def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>; +def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>; +def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>; + +def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>; +def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>; + +def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>; +def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>; +def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; + +def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; +def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; + +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>; +def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; +def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; + +def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; +def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; + +def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>; +def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>; + +def X86VPermilpv : 
SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; +def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; +def X86VPermv : SDNode<"X86ISD::VPERMV", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; +def X86VPermt2 : SDNode<"X86ISD::VPERMV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>, + SDTCisSameSizeAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86VPermi2X : SDNode<"X86ISD::VPERMIV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>; + +def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; + +def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; +def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>; +def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>; +def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>; +def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisFP<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisVT<2, i32>]>, []>; +def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", + SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; + +def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSubVecOfVec<1, 0>]>, []>; +// SDTCisSubVecOfVec restriction cannot be applied for 128 bit version of VBROADCASTI32x2. 
+def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisSameAs<0,1>]>, []>; + +def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; +def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>, + SDTCisPtrTy<3>]>, []>; +def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, + [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + SDTCisPtrTy<2>]>, []>; + +def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; + +def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; + +def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; +def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; +def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; +def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; +def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>; +def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>; + +def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; +def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; +def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; +def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; +def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; +def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; + +def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>; +def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>; +def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound>; +def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>; +def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>; +def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>; + +def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; +def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; +def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; + +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; +def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; +def X86RndScales : SDNode<"X86ISD::VRNDSCALE", STDFp3SrcRm>; +def X86Reduces : SDNode<"X86ISD::VREDUCE", STDFp3SrcRm>; +def X86GetMants : SDNode<"X86ISD::VGETMANT", STDFp3SrcRm>; + +def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, + SDTCisVT<4, i8>]>; +def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, v16i8>, SDTCisVT<3, i32>, + SDTCisVT<4, v16i8>, SDTCisVT<5, i32>, + SDTCisVT<6, i8>]>; + +def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>; +def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; + +def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1, + [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; +def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, + [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; + +def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVT<3, i32>]>; + +def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; + +def SDTDoubleToIntRnd: 
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTSDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisFP<1>, + SDTCVecEltisVT<1, f64>, SDTCisInt<2>]>; +def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; +def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, + SDTCVecEltisVT<1, f32>, SDTCisInt<2>]>; +def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCVecEltisVT<1, i32>, + SDTCisInt<2>]>; +def SDTVlongToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCVecEltisVT<1, i64>, + SDTCisInt<2>]>; + +def SDTVFPToIntRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<1>, SDTCVecEltisVT<0, i32>, + SDTCisInt<2>]>; +def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<1>, SDTCVecEltisVT<0, i64>, + SDTCisInt<2>]>; + +// Scalar +def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; +def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; + +def X86cvttss2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>; +def X86cvttss2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>; +def X86cvttsd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSDoubleToIntRnd>; +def X86cvttsd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSDoubleToIntRnd>; +// Vector with rounding mode + +// cvtt fp-to-int staff +def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToIntRound>; +def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToIntRound>; +def X86VFpToSlongRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToLongRound>; +def X86VFpToUlongRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToLongRound>; + +def X86VSintToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVintToFPRound>; +def X86VUintToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVintToFPRound>; +def X86VSlongToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVlongToFPRound>; +def X86VUlongToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVlongToFPRound>; + +// cvt fp-to-int staff +def X86cvtps2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>; +def X86cvtps2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>; +def X86cvtpd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToIntRnd>; +def X86cvtpd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToIntRnd>; + +// Vector without rounding mode +def X86cvtps2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>; +def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>; +def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>; +def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>; + +def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>, + SDTCisFP<0>, + SDTCisVT<2, i32>]> >; + +def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisFP<1>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>]> >; +def X86vfpextRnd : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisVT<2, i32>]>>; +def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisVT<2, i32>]>>; + +def 
X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>; + +//===----------------------------------------------------------------------===// +// SSE Complex Patterns +//===----------------------------------------------------------------------===// + +// These are 'extloads' from a scalar to the low element of a vector, zeroing +// the top elements. These are used for the SSE 'ss' and 'sd' instruction +// forms. +def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot]>; +def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot]>; + +def ssmem : Operand<v4f32> { + let PrintMethod = "printf32mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86Mem32AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} +def sdmem : Operand<v2f64> { + let PrintMethod = "printf64mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86Mem64AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +//===----------------------------------------------------------------------===// +// SSE pattern fragments +//===----------------------------------------------------------------------===// + +// 128-bit load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; +def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; +def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; + +// 256-bit load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 +def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; +def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; +def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; + +// 512-bit load pattern fragments +def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; +def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; +def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>; +def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>; +def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>; +def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; + +// 128-/256-/512-bit extload pattern fragments +def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; +def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; +def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; + +// These are needed to match a scalar load that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. +def loadf32_128 : PatFrag<(ops node:$ptr), + (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>; +def loadf64_128 : PatFrag<(ops node:$ptr), + (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>; + +// Like 'store', but always requires 128-bit vector alignment. +def alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'store', but always requires 256-bit vector alignment. 
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 32; +}]>; + +// Like 'store', but always requires 512-bit vector alignment. +def alignedstore512 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 64; +}]>; + +// Like 'load', but always requires 128-bit vector alignment. +def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'X86vzload', but always requires 128-bit vector alignment. +def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{ + return cast<MemSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'load', but always requires 256-bit vector alignment. +def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 32; +}]>; + +// Like 'load', but always requires 512-bit vector alignment. +def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 64; +}]>; + +def alignedloadfsf32 : PatFrag<(ops node:$ptr), + (f32 (alignedload node:$ptr))>; +def alignedloadfsf64 : PatFrag<(ops node:$ptr), + (f64 (alignedload node:$ptr))>; + +// 128-bit aligned load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def alignedloadv4f32 : PatFrag<(ops node:$ptr), + (v4f32 (alignedload node:$ptr))>; +def alignedloadv2f64 : PatFrag<(ops node:$ptr), + (v2f64 (alignedload node:$ptr))>; +def alignedloadv2i64 : PatFrag<(ops node:$ptr), + (v2i64 (alignedload node:$ptr))>; + +// 256-bit aligned load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 +def alignedloadv8f32 : PatFrag<(ops node:$ptr), + (v8f32 (alignedload256 node:$ptr))>; +def alignedloadv4f64 : PatFrag<(ops node:$ptr), + (v4f64 (alignedload256 node:$ptr))>; +def alignedloadv4i64 : PatFrag<(ops node:$ptr), + (v4i64 (alignedload256 node:$ptr))>; + +// 512-bit aligned load pattern fragments +def alignedloadv16f32 : PatFrag<(ops node:$ptr), + (v16f32 (alignedload512 node:$ptr))>; +def alignedloadv16i32 : PatFrag<(ops node:$ptr), + (v16i32 (alignedload512 node:$ptr))>; +def alignedloadv8f64 : PatFrag<(ops node:$ptr), + (v8f64 (alignedload512 node:$ptr))>; +def alignedloadv8i64 : PatFrag<(ops node:$ptr), + (v8i64 (alignedload512 node:$ptr))>; + +// Like 'load', but uses special alignment checks suitable for use in +// memory operands in most SSE instructions, which are required to +// be naturally aligned on some targets but not on others. If the subtarget +// allows unaligned accesses, match any load, though this may require +// setting a feature bit in the processor (on startup, for example). +// Opteron 10h and later implement such a feature. 
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return Subtarget->hasSSEUnalignedMem() + || cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; +def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; + +// 128-bit memop pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; +def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; +def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; + +// These are needed to match a scalar memop that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. +def memopfsf32_128 : PatFrag<(ops node:$ptr), + (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>; +def memopfsf64_128 : PatFrag<(ops node:$ptr), + (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>; + + +// SSSE3 uses MMX registers for some instructions. They aren't aligned on a +// 16-byte boundary. +// FIXME: 8 byte alignment for mmx reads is not required +def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 8; +}]>; + +def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; + +def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i32 || + Mgt->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + +def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v8i32 || + Mgt->getBasePtr().getValueType() == MVT::v8i32); + return false; +}]>; + +def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64); + return false; +}]>; +def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i64 || + Mgt->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; +def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v8i64 || + Mgt->getBasePtr().getValueType() == MVT::v8i64); + return false; +}]>; +def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v16i32 || + Mgt->getBasePtr().getValueType() == MVT::v16i32); + return false; +}]>; + +def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = 
dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v2i64 || + Sc->getBasePtr().getValueType() == MVT::v2i64); + return false; +}]>; + +def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v4i32 || + Sc->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + +def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v4i64 || + Sc->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; + +def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v8i32 || + Sc->getBasePtr().getValueType() == MVT::v8i32); + return false; +}]>; + +def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v8i64 || + Sc->getBasePtr().getValueType() == MVT::v8i64); + return false; +}]>; +def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v16i32 || + Sc->getBasePtr().getValueType() == MVT::v16i32); + return false; +}]>; + +// 128-bit bitconvert pattern fragments +def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; +def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; +def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; +def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; +def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; +def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; + +// 256-bit bitconvert pattern fragments +def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>; +def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>; +def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>; +def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>; +def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>; + +// 512-bit bitconvert pattern fragments +def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>; +def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; +def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; +def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>; + +def vzmovl_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; +def vzmovl_v4i32 : PatFrag<(ops node:$src), + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; + +def vzload_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzload node:$src)))>; + + +def fp32imm0 : PatLeaf<(f32 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +def I8Imm : SDNodeXForm<imm, [{ + // Transformation function: get the low 8 bits. 
+ return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); +}]>; + +def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>; +def FROUND_CURRENT : ImmLeaf<i32, [{ + return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION; +}]>; + +// BYTE_imm - Transform bit immediates into byte immediates. +def BYTE_imm : SDNodeXForm<imm, [{ + // Transformation function: imm >> 3 + return getI32Imm(N->getZExtValue() >> 3, SDLoc(N)); +}]>; + +// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index +// to VEXTRACTF128/VEXTRACTI128 imm. +def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT128Immediate(N), SDLoc(N)); +}]>; + +// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to +// VINSERTF128/VINSERTI128 imm. +def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT128Immediate(N), SDLoc(N)); +}]>; + +// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index +// to VEXTRACTF64x4 imm. +def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT256Immediate(N), SDLoc(N)); +}]>; + +// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to +// VINSERTF64x4 imm. +def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT256Immediate(N), SDLoc(N)); +}]>; + +def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACT128Index(N); +}], EXTRACT_get_vextract128_imm>; + +def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERT128Index(N); +}], INSERT_get_vinsert128_imm>; + + +def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACT256Index(N); +}], EXTRACT_get_vextract256_imm>; + +def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERT256Index(N); +}], INSERT_get_vinsert256_imm>; + +def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) + return Load->getAlignment() >= 16; + return false; +}]>; + +def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) + return Load->getAlignment() >= 32; + return false; +}]>; + +def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) + return Load->getAlignment() >= 64; + return false; +}]>; + +def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + return isa<MaskedLoadSDNode>(N); +}]>; + +// masked store fragments. 
+// X86mstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; + +def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) + return Store->getAlignment() >= 16; + return false; +}]>; + +def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) + return Store->getAlignment() >= 32; + return false; +}]>; + +def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) + return Store->getAlignment() >= 64; + return false; +}]>; + +def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + return isa<MaskedStoreSDNode>(N); +}]>; + +// masked truncstore fragments +// X86mtruncstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def masked_truncstorevi8 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def masked_truncstorevi16 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def masked_truncstorevi32 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp new file mode 100644 index 0000000..246804e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -0,0 +1,7330 @@ +//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +#include <limits> + +using namespace llvm; + +#define DEBUG_TYPE "x86-instr-info" + +#define GET_INSTRINFO_CTOR_DTOR +#include "X86GenInstrInfo.inc" + +static cl::opt<bool> +NoFusing("disable-spill-fusing", + cl::desc("Disable fusing of spill code into instructions")); +static cl::opt<bool> +PrintFailedFusing("print-failed-fuse-candidates", + cl::desc("Print instructions that the allocator wants to" + " fuse, but the X86 backend currently can't"), + cl::Hidden); +static cl::opt<bool> +ReMatPICStubLoad("remat-pic-stub-load", + cl::desc("Re-materialize load from stub in PIC mode"), + cl::init(false), cl::Hidden); + +enum { + // Select which memory operand is being unfolded. + // (stored in bits 0 - 3) + TB_INDEX_0 = 0, + TB_INDEX_1 = 1, + TB_INDEX_2 = 2, + TB_INDEX_3 = 3, + TB_INDEX_4 = 4, + TB_INDEX_MASK = 0xf, + + // Do not insert the reverse map (MemOp -> RegOp) into the table. + // This may be needed because there is a many -> one mapping. + TB_NO_REVERSE = 1 << 4, + + // Do not insert the forward map (RegOp -> MemOp) into the table. + // This is needed for Native Client, which prohibits branch + // instructions from using a memory operand. + TB_NO_FORWARD = 1 << 5, + + TB_FOLDED_LOAD = 1 << 6, + TB_FOLDED_STORE = 1 << 7, + + // Minimum alignment required for load/store. + // Used for RegOp->MemOp conversion. + // (stored in bits 8 - 15) + TB_ALIGN_SHIFT = 8, + TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, + TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, + TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, + TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT, + TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT +}; + +struct X86MemoryFoldTableEntry { + uint16_t RegOp; + uint16_t MemOp; + uint16_t Flags; +}; + +// Pin the vtable to this file. +void X86InstrInfo::anchor() {} + +X86InstrInfo::X86InstrInfo(X86Subtarget &STI) + : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 + : X86::ADJCALLSTACKDOWN32), + (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 + : X86::ADJCALLSTACKUP32), + X86::CATCHRET), + Subtarget(STI), RI(STI.getTargetTriple()) { + + static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { + { X86::ADC32ri, X86::ADC32mi, 0 }, + { X86::ADC32ri8, X86::ADC32mi8, 0 }, + { X86::ADC32rr, X86::ADC32mr, 0 }, + { X86::ADC64ri32, X86::ADC64mi32, 0 }, + { X86::ADC64ri8, X86::ADC64mi8, 0 }, + { X86::ADC64rr, X86::ADC64mr, 0 }, + { X86::ADD16ri, X86::ADD16mi, 0 }, + { X86::ADD16ri8, X86::ADD16mi8, 0 }, + { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, + { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, + { X86::ADD16rr, X86::ADD16mr, 0 }, + { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, + { X86::ADD32ri, X86::ADD32mi, 0 }, + { X86::ADD32ri8, X86::ADD32mi8, 0 }, + { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, + { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, + { X86::ADD32rr, X86::ADD32mr, 0 }, + { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, + { X86::ADD64ri32, X86::ADD64mi32, 0 }, + { X86::ADD64ri8, X86::ADD64mi8, 0 }, + { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, + { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, + { X86::ADD64rr, X86::ADD64mr, 0 }, + { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, + { X86::ADD8ri, X86::ADD8mi, 0 }, + { X86::ADD8rr, X86::ADD8mr, 0 }, + { X86::AND16ri, X86::AND16mi, 0 }, + { X86::AND16ri8, X86::AND16mi8, 0 }, + { X86::AND16rr, X86::AND16mr, 0 }, + { X86::AND32ri, X86::AND32mi, 0 }, + { X86::AND32ri8, X86::AND32mi8, 0 }, + { X86::AND32rr, X86::AND32mr, 0 }, + { X86::AND64ri32, X86::AND64mi32, 0 }, + { X86::AND64ri8, X86::AND64mi8, 0 }, + { X86::AND64rr, X86::AND64mr, 0 }, + { X86::AND8ri, X86::AND8mi, 0 }, + { X86::AND8rr, X86::AND8mr, 0 }, + { X86::DEC16r, X86::DEC16m, 0 }, + { X86::DEC32r, X86::DEC32m, 0 }, + { X86::DEC64r, X86::DEC64m, 0 }, + { X86::DEC8r, X86::DEC8m, 0 }, + { X86::INC16r, X86::INC16m, 0 }, + { X86::INC32r, X86::INC32m, 0 }, + { X86::INC64r, X86::INC64m, 0 }, + { X86::INC8r, X86::INC8m, 0 }, + { X86::NEG16r, X86::NEG16m, 0 }, + { X86::NEG32r, X86::NEG32m, 0 }, + { X86::NEG64r, X86::NEG64m, 0 }, + { X86::NEG8r, X86::NEG8m, 0 }, + { X86::NOT16r, X86::NOT16m, 0 }, + { X86::NOT32r, X86::NOT32m, 0 }, + { X86::NOT64r, X86::NOT64m, 0 }, + { X86::NOT8r, X86::NOT8m, 0 }, + { X86::OR16ri, X86::OR16mi, 0 }, + { X86::OR16ri8, X86::OR16mi8, 0 }, + { X86::OR16rr, X86::OR16mr, 0 }, + { X86::OR32ri, X86::OR32mi, 0 }, + { X86::OR32ri8, X86::OR32mi8, 0 }, + { X86::OR32rr, X86::OR32mr, 0 }, + { X86::OR64ri32, X86::OR64mi32, 0 }, + { X86::OR64ri8, X86::OR64mi8, 0 }, + { X86::OR64rr, X86::OR64mr, 0 }, + { X86::OR8ri, X86::OR8mi, 0 }, + { X86::OR8rr, X86::OR8mr, 0 }, + { X86::ROL16r1, X86::ROL16m1, 0 }, + { X86::ROL16rCL, X86::ROL16mCL, 0 }, + { X86::ROL16ri, X86::ROL16mi, 0 }, + { X86::ROL32r1, X86::ROL32m1, 0 }, + { X86::ROL32rCL, X86::ROL32mCL, 0 }, + { X86::ROL32ri, X86::ROL32mi, 0 }, + { X86::ROL64r1, X86::ROL64m1, 0 }, + { X86::ROL64rCL, X86::ROL64mCL, 0 }, + { X86::ROL64ri, X86::ROL64mi, 0 }, + { X86::ROL8r1, X86::ROL8m1, 0 }, + { X86::ROL8rCL, X86::ROL8mCL, 0 }, + { X86::ROL8ri, X86::ROL8mi, 0 }, + { X86::ROR16r1, X86::ROR16m1, 0 }, + { X86::ROR16rCL, X86::ROR16mCL, 0 }, + { X86::ROR16ri, X86::ROR16mi, 0 }, + { X86::ROR32r1, X86::ROR32m1, 0 }, + { X86::ROR32rCL, X86::ROR32mCL, 0 }, + { X86::ROR32ri, X86::ROR32mi, 0 }, + { X86::ROR64r1, X86::ROR64m1, 0 }, + { X86::ROR64rCL, X86::ROR64mCL, 0 }, + { X86::ROR64ri, X86::ROR64mi, 0 }, + { X86::ROR8r1, X86::ROR8m1, 0 }, + { X86::ROR8rCL, X86::ROR8mCL, 0 }, + { X86::ROR8ri, X86::ROR8mi, 0 }, + { 
X86::SAR16r1, X86::SAR16m1, 0 }, + { X86::SAR16rCL, X86::SAR16mCL, 0 }, + { X86::SAR16ri, X86::SAR16mi, 0 }, + { X86::SAR32r1, X86::SAR32m1, 0 }, + { X86::SAR32rCL, X86::SAR32mCL, 0 }, + { X86::SAR32ri, X86::SAR32mi, 0 }, + { X86::SAR64r1, X86::SAR64m1, 0 }, + { X86::SAR64rCL, X86::SAR64mCL, 0 }, + { X86::SAR64ri, X86::SAR64mi, 0 }, + { X86::SAR8r1, X86::SAR8m1, 0 }, + { X86::SAR8rCL, X86::SAR8mCL, 0 }, + { X86::SAR8ri, X86::SAR8mi, 0 }, + { X86::SBB32ri, X86::SBB32mi, 0 }, + { X86::SBB32ri8, X86::SBB32mi8, 0 }, + { X86::SBB32rr, X86::SBB32mr, 0 }, + { X86::SBB64ri32, X86::SBB64mi32, 0 }, + { X86::SBB64ri8, X86::SBB64mi8, 0 }, + { X86::SBB64rr, X86::SBB64mr, 0 }, + { X86::SHL16rCL, X86::SHL16mCL, 0 }, + { X86::SHL16ri, X86::SHL16mi, 0 }, + { X86::SHL32rCL, X86::SHL32mCL, 0 }, + { X86::SHL32ri, X86::SHL32mi, 0 }, + { X86::SHL64rCL, X86::SHL64mCL, 0 }, + { X86::SHL64ri, X86::SHL64mi, 0 }, + { X86::SHL8rCL, X86::SHL8mCL, 0 }, + { X86::SHL8ri, X86::SHL8mi, 0 }, + { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 }, + { X86::SHLD16rri8, X86::SHLD16mri8, 0 }, + { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 }, + { X86::SHLD32rri8, X86::SHLD32mri8, 0 }, + { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 }, + { X86::SHLD64rri8, X86::SHLD64mri8, 0 }, + { X86::SHR16r1, X86::SHR16m1, 0 }, + { X86::SHR16rCL, X86::SHR16mCL, 0 }, + { X86::SHR16ri, X86::SHR16mi, 0 }, + { X86::SHR32r1, X86::SHR32m1, 0 }, + { X86::SHR32rCL, X86::SHR32mCL, 0 }, + { X86::SHR32ri, X86::SHR32mi, 0 }, + { X86::SHR64r1, X86::SHR64m1, 0 }, + { X86::SHR64rCL, X86::SHR64mCL, 0 }, + { X86::SHR64ri, X86::SHR64mi, 0 }, + { X86::SHR8r1, X86::SHR8m1, 0 }, + { X86::SHR8rCL, X86::SHR8mCL, 0 }, + { X86::SHR8ri, X86::SHR8mi, 0 }, + { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 }, + { X86::SHRD16rri8, X86::SHRD16mri8, 0 }, + { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 }, + { X86::SHRD32rri8, X86::SHRD32mri8, 0 }, + { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 }, + { X86::SHRD64rri8, X86::SHRD64mri8, 0 }, + { X86::SUB16ri, X86::SUB16mi, 0 }, + { X86::SUB16ri8, X86::SUB16mi8, 0 }, + { X86::SUB16rr, X86::SUB16mr, 0 }, + { X86::SUB32ri, X86::SUB32mi, 0 }, + { X86::SUB32ri8, X86::SUB32mi8, 0 }, + { X86::SUB32rr, X86::SUB32mr, 0 }, + { X86::SUB64ri32, X86::SUB64mi32, 0 }, + { X86::SUB64ri8, X86::SUB64mi8, 0 }, + { X86::SUB64rr, X86::SUB64mr, 0 }, + { X86::SUB8ri, X86::SUB8mi, 0 }, + { X86::SUB8rr, X86::SUB8mr, 0 }, + { X86::XOR16ri, X86::XOR16mi, 0 }, + { X86::XOR16ri8, X86::XOR16mi8, 0 }, + { X86::XOR16rr, X86::XOR16mr, 0 }, + { X86::XOR32ri, X86::XOR32mi, 0 }, + { X86::XOR32ri8, X86::XOR32mi8, 0 }, + { X86::XOR32rr, X86::XOR32mr, 0 }, + { X86::XOR64ri32, X86::XOR64mi32, 0 }, + { X86::XOR64ri8, X86::XOR64mi8, 0 }, + { X86::XOR64rr, X86::XOR64mr, 0 }, + { X86::XOR8ri, X86::XOR8mi, 0 }, + { X86::XOR8rr, X86::XOR8mr, 0 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) { + AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 0, folded load and store, no alignment requirement. 
+ Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { + { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, + { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, + { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, + { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, + { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, + { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, + { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, + { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, + { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD }, + { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD }, + { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD }, + { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD }, + { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD }, + { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD }, + { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD }, + { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD }, + { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD }, + { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD }, + { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, + { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, + { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE }, + { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, + { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD }, + { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD }, + { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD }, + { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD }, + { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD }, + { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD }, + { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD }, + { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD }, + { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, + { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, + { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, + { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, + { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, + { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, + { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, + { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, + { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, + { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE }, + { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, + { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE }, + { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, + { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, + { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, + { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, + { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, + { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, + { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, + { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, + { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, + { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, + { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, + { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, + { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, + { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, + { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, + { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, + { X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, + { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, + { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, + { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, + { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE }, + { X86::SETLr, X86::SETLm, 
TB_FOLDED_STORE }, + { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE }, + { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, + { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, + { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, + { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, + { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, + { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, + { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, + { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, + { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, + { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, + { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, + { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, + { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, + + // AVX 128-bit versions of foldable instructions + { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE }, + { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE }, + { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE }, + { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, + { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, + { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, + { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE }, + { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE }, + + // AVX 256-bit foldable instructions + { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, + { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, + + // AVX-512 foldable instructions + { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, + { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, + { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, + + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, + { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE 
}, + + // AVX-512 foldable instructions (128-bit versions) + { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, + { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE }, + + // F16C foldable instructions + { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE }, + { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) { + AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { + { X86::BSF16rr, X86::BSF16rm, 0 }, + { X86::BSF32rr, X86::BSF32rm, 0 }, + { X86::BSF64rr, X86::BSF64rm, 0 }, + { X86::BSR16rr, X86::BSR16rm, 0 }, + { X86::BSR32rr, X86::BSR32rm, 0 }, + { X86::BSR64rr, X86::BSR64rm, 0 }, + { X86::CMP16rr, X86::CMP16rm, 0 }, + { X86::CMP32rr, X86::CMP32rm, 0 }, + { X86::CMP64rr, X86::CMP64rm, 0 }, + { X86::CMP8rr, X86::CMP8rm, 0 }, + { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, + { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 }, + { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, + { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 }, + { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 }, + { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, + { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 }, + { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, + { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, + { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, + { X86::IMUL16rri, X86::IMUL16rmi, 0 }, + { X86::IMUL16rri8, X86::IMUL16rmi8, 0 }, + { X86::IMUL32rri, X86::IMUL32rmi, 0 }, + { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, + { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, + { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, + { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, + { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, + { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 }, + { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, + { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, + { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, + { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 }, + { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, + { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, + { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, + { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 }, + { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, + { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, + { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 }, + { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 }, + { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 }, + { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 }, + { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 }, + { X86::MOV16rr, X86::MOV16rm, 0 }, + { X86::MOV32rr, X86::MOV32rm, 0 }, + { X86::MOV64rr, X86::MOV64rm, 0 }, + { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, + { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, + { X86::MOV8rr, X86::MOV8rm, 0 }, + { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, + { 
X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, + { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 }, + { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, + { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, + { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, + { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, + { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 }, + { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, + { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 }, + { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 }, + { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, + { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, + { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, + { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, + { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, + { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, + { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, + { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, + { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, + { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, + { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, + { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, + { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 }, + { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 }, + { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 }, + { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 }, + { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 }, + { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 }, + { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 }, + { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 }, + { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 }, + { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 }, + { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 }, + { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 }, + { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 }, + { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 }, + { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 }, + { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 }, + { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 }, + { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, + { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, + { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, + { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 }, + { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, + { X86::RCPSSr, X86::RCPSSm, 0 }, + { X86::RCPSSr_Int, X86::RCPSSm_Int, 0 }, + { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, + { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, + { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, + { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, + { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 }, + { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, + { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, + { X86::SQRTSDr, X86::SQRTSDm, 0 }, + { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 }, + { X86::SQRTSSr, X86::SQRTSSm, 0 }, + { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 }, + { X86::TEST16rr, X86::TEST16rm, 0 }, + { X86::TEST32rr, X86::TEST32rm, 0 }, + { X86::TEST64rr, X86::TEST64rm, 0 }, + { X86::TEST8rr, X86::TEST8rm, 0 }, + // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 + { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, + { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, + + // MMX version of foldable instructions + { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 }, + { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 }, + { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 }, + { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 }, + { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 }, + { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 }, + { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 }, + { X86::MMX_PABSDrr64, 
X86::MMX_PABSDrm64, 0 }, + { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 }, + { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 }, + + // 3DNow! version of foldable instructions + { X86::PF2IDrr, X86::PF2IDrm, 0 }, + { X86::PF2IWrr, X86::PF2IWrm, 0 }, + { X86::PFRCPrr, X86::PFRCPrm, 0 }, + { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 }, + { X86::PI2FDrr, X86::PI2FDrm, 0 }, + { X86::PI2FWrr, X86::PI2FWrm, 0 }, + { X86::PSWAPDrr, X86::PSWAPDrm, 0 }, + + // AVX 128-bit versions of foldable instructions + { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, + { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, + { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 }, + { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 }, + { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, + { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 }, + { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, + { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 }, + { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, + { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 }, + { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, + { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 }, + { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, + { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, + { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, + { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, + { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 }, + { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, + { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, + { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 }, + { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, + { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, + { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, + { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, + { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, + { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, + { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, + { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 }, + { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, + { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, + { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, + { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 }, + { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, + { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, + { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, + { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, + { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, + { X86::VPABSWrr128, X86::VPABSWrm128, 0 }, + { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, + { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, + { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, + { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 }, + { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 }, + { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, + { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, + { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 }, + { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 }, + { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 }, + { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 }, + { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 }, + { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 }, + { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 }, + { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 }, + { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 }, + { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 }, + { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 }, + { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 }, + { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, + { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, + { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, + { X86::VPTESTrr, X86::VPTESTrm, 0 }, + { X86::VRCPPSr, X86::VRCPPSm, 0 }, + { X86::VROUNDPDr, X86::VROUNDPDm, 0 }, + { 
X86::VROUNDPSr, X86::VROUNDPSm, 0 }, + { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, + { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, + { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, + { X86::VTESTPDrr, X86::VTESTPDrm, 0 }, + { X86::VTESTPSrr, X86::VTESTPSrm, 0 }, + { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, + { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, + + // AVX 256-bit foldable instructions + { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, + { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, + { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, + { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, + { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, + { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 }, + { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, + { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, + { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, + { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, + { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 }, + { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, + { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 }, + { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 }, + { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, + { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, + { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, + { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, + { X86::VPTESTYrr, X86::VPTESTYrm, 0 }, + { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, + { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 }, + { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 }, + { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, + { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, + { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, + { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 }, + { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 }, + + // AVX2 foldable instructions + + // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the + // VBROADCASTS{SD}rm memory instructions were available from AVX1. + // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction + // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions + // so they don't need an equivalent limitation. 
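The TB_NO_REVERSE note above is what makes these one-directional entries work: the flag keeps a pair in the reg->mem (folding) table while leaving it out of the mem->reg (unfolding) table. Below is a minimal sketch of a registration helper in that spirit; the helper name, map types, and flag bit value are illustrative assumptions, not the exact code in this file.

#include "llvm/ADT/DenseMap.h"
#include <cassert>
#include <cstdint>
#include <utility>

// Placeholder bit for the sketch; the real TB_* flags (index, load/store,
// alignment, TB_NO_REVERSE, ...) are defined in X86InstrInfo.h.
enum : uint16_t { SK_NO_REVERSE = 1u << 14 };

using FoldTable = llvm::DenseMap<unsigned, std::pair<unsigned, uint16_t>>;

// Record a (register form, memory form) pair. The reg->mem table drives
// load/store folding; the mem->reg table drives unfolding and is skipped for
// one-directional entries, so an AVX1-only memory form is never "unfolded"
// into an AVX2-only register form.
static void addFoldTableEntry(FoldTable &RegToMem, FoldTable &MemToReg,
                              unsigned RegOp, unsigned MemOp, uint16_t Flags) {
  assert(!RegToMem.count(RegOp) && "duplicate reg->mem entry");
  RegToMem[RegOp] = std::make_pair(MemOp, Flags);
  if ((Flags & SK_NO_REVERSE) == 0) {
    assert(!MemToReg.count(MemOp) && "duplicate mem->reg entry");
    MemToReg[MemOp] = std::make_pair(RegOp, Flags);
  }
}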
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, + { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, + { X86::VPABSDrr256, X86::VPABSDrm256, 0 }, + { X86::VPABSWrr256, X86::VPABSWrm256, 0 }, + { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 }, + { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 }, + { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 }, + { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 }, + { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 }, + { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 }, + { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 }, + { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 }, + { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, + { X86::VPERMQYri, X86::VPERMQYmi, 0 }, + { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 }, + { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 }, + { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 }, + { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 }, + { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 }, + { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 }, + { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 }, + { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 }, + { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 }, + { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 }, + { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 }, + { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 }, + { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, + { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, + { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, + + // XOP foldable instructions + { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 }, + { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 }, + { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 }, + { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 }, + { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 }, + { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 }, + { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 }, + { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 }, + { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 }, + { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 }, + { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 }, + { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 }, + { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 }, + { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 }, + { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 }, + { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 }, + { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 }, + { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 }, + { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 }, + { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 }, + { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 }, + { X86::VPROTBri, X86::VPROTBmi, 0 }, + { X86::VPROTBrr, X86::VPROTBmr, 0 }, + { X86::VPROTDri, X86::VPROTDmi, 0 }, + { X86::VPROTDrr, X86::VPROTDmr, 0 }, + { X86::VPROTQri, X86::VPROTQmi, 0 }, + { X86::VPROTQrr, X86::VPROTQmr, 0 }, + { X86::VPROTWri, X86::VPROTWmi, 0 }, + { X86::VPROTWrr, X86::VPROTWmr, 0 }, + { X86::VPSHABrr, X86::VPSHABmr, 0 }, + { X86::VPSHADrr, X86::VPSHADmr, 0 }, + { X86::VPSHAQrr, X86::VPSHAQmr, 0 }, + { X86::VPSHAWrr, X86::VPSHAWmr, 0 }, + { X86::VPSHLBrr, X86::VPSHLBmr, 0 }, + { X86::VPSHLDrr, X86::VPSHLDmr, 0 }, + { X86::VPSHLQrr, X86::VPSHLQmr, 0 }, + { X86::VPSHLWrr, X86::VPSHLWmr, 0 }, + + // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions + { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, + { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, + { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 }, + { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 }, + { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 }, + { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 }, + { X86::BLCI32rr, X86::BLCI32rm, 0 }, + { X86::BLCI64rr, X86::BLCI64rm, 0 }, + { X86::BLCIC32rr, X86::BLCIC32rm, 0 }, + { X86::BLCIC64rr, 
X86::BLCIC64rm, 0 }, + { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 }, + { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 }, + { X86::BLCS32rr, X86::BLCS32rm, 0 }, + { X86::BLCS64rr, X86::BLCS64rm, 0 }, + { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 }, + { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 }, + { X86::BLSI32rr, X86::BLSI32rm, 0 }, + { X86::BLSI64rr, X86::BLSI64rm, 0 }, + { X86::BLSIC32rr, X86::BLSIC32rm, 0 }, + { X86::BLSIC64rr, X86::BLSIC64rm, 0 }, + { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, + { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, + { X86::BLSR32rr, X86::BLSR32rm, 0 }, + { X86::BLSR64rr, X86::BLSR64rm, 0 }, + { X86::BZHI32rr, X86::BZHI32rm, 0 }, + { X86::BZHI64rr, X86::BZHI64rm, 0 }, + { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, + { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, + { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, + { X86::POPCNT16rr, X86::POPCNT16rm, 0 }, + { X86::POPCNT32rr, X86::POPCNT32rm, 0 }, + { X86::POPCNT64rr, X86::POPCNT64rm, 0 }, + { X86::RORX32ri, X86::RORX32mi, 0 }, + { X86::RORX64ri, X86::RORX64mi, 0 }, + { X86::SARX32rr, X86::SARX32rm, 0 }, + { X86::SARX64rr, X86::SARX64rm, 0 }, + { X86::SHRX32rr, X86::SHRX32rm, 0 }, + { X86::SHRX64rr, X86::SHRX64rm, 0 }, + { X86::SHLX32rr, X86::SHLX32rm, 0 }, + { X86::SHLX64rr, X86::SHLX64rm, 0 }, + { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 }, + { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 }, + { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, + { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, + { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, + { X86::TZMSK32rr, X86::TZMSK32rm, 0 }, + { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, + + // AVX-512 foldable instructions + { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, + { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, + { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, + { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, + { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, + { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, + { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, + { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, + { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, + { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, + { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, + { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, + { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, + { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, + { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, + { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, + { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, + { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, + { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, + { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, + { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, + { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, + { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + + // AVX-512 foldable instructions (128-bit versions) + { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, + { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, + { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, + { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, + { X86::VMOVDQU32Z128rr, 
X86::VMOVDQU32Z128rm, 0 }, + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, + { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, + { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, + { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + + // F16C foldable instructions + { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, + { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, + + // AES foldable instructions + { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, + { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, + { X86::VAESIMCrr, X86::VAESIMCrm, 0 }, + { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) { + AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 1, folded load + Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { + { X86::ADC32rr, X86::ADC32rm, 0 }, + { X86::ADC64rr, X86::ADC64rm, 0 }, + { X86::ADD16rr, X86::ADD16rm, 0 }, + { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, + { X86::ADD32rr, X86::ADD32rm, 0 }, + { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, + { X86::ADD64rr, X86::ADD64rm, 0 }, + { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, + { X86::ADD8rr, X86::ADD8rm, 0 }, + { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, + { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, + { X86::ADDSDrr, X86::ADDSDrm, 0 }, + { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 }, + { X86::ADDSSrr, X86::ADDSSrm, 0 }, + { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 }, + { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, + { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, + { X86::AND16rr, X86::AND16rm, 0 }, + { X86::AND32rr, X86::AND32rm, 0 }, + { X86::AND64rr, X86::AND64rm, 0 }, + { X86::AND8rr, X86::AND8rm, 0 }, + { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 }, + { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 }, + { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 }, + { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 }, + { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 }, + { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, + { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, + { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, + { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, + { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, + { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, + { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, + { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, + { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, + { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, + { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, + { X86::CMOVB64rr, X86::CMOVB64rm, 0 }, + { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, + { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 }, + { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, + { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, + { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, + { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, + { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, + { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, + { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, + { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, + { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, + { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, + { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, + { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, + { X86::CMOVL64rr, X86::CMOVL64rm, 0 }, + { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, + { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, + { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, + { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, + { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, + { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, + { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, + { X86::CMOVNO32rr, X86::CMOVNO32rm, 
0 }, + { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, + { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, + { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, + { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, + { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, + { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, + { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, + { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, + { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, + { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, + { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, + { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, + { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, + { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, + { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, + { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, + { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, + { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, + { X86::CMPSDrr, X86::CMPSDrm, 0 }, + { X86::CMPSSrr, X86::CMPSSrm, 0 }, + { X86::CRC32r32r32, X86::CRC32r32m32, 0 }, + { X86::CRC32r64r64, X86::CRC32r64m64, 0 }, + { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, + { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, + { X86::DIVSDrr, X86::DIVSDrm, 0 }, + { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 }, + { X86::DIVSSrr, X86::DIVSSrm, 0 }, + { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 }, + { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, + { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, + + // Do not fold Fs* scalar logical op loads because there are no scalar + // load variants for these instructions. When folded, the load is required + // to be 128-bits, so the load size would not match. + + { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 }, + { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 }, + { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 }, + { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 }, + { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 }, + { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 }, + { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 }, + { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 }, + { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, + { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, + { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, + { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 }, + { X86::IMUL16rr, X86::IMUL16rm, 0 }, + { X86::IMUL32rr, X86::IMUL32rm, 0 }, + { X86::IMUL64rr, X86::IMUL64rm, 0 }, + { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, + { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, + { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 }, + { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, + { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, + { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, + { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, + { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 }, + { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, + { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, + { X86::MAXSDrr, X86::MAXSDrm, 0 }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 }, + { X86::MAXSSrr, X86::MAXSSrm, 0 }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 }, + { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, + { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, + { X86::MINSDrr, X86::MINSDrm, 0 }, + { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, + { X86::MINSSrr, X86::MINSSrm, 0 }, + { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, + { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, + { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, + { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, + { X86::MULSDrr, X86::MULSDrm, 0 }, + { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 }, + { X86::MULSSrr, X86::MULSSrm, 0 }, + { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 }, + { X86::OR16rr, X86::OR16rm, 0 }, + { X86::OR32rr, X86::OR32rm, 0 }, + { X86::OR64rr, X86::OR64rm, 0 }, + 
{ X86::OR8rr, X86::OR8rm, 0 }, + { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 }, + { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 }, + { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 }, + { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 }, + { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 }, + { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 }, + { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 }, + { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 }, + { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 }, + { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 }, + { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 }, + { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 }, + { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 }, + { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, + { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 }, + { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 }, + { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, + { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, + { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, + { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 }, + { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, + { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 }, + { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, + { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, + { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, + { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 }, + { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 }, + { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 }, + { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 }, + { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 }, + { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 }, + { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 }, + { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 }, + { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, + { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, + { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, + { X86::PINSRBrr, X86::PINSRBrm, 0 }, + { X86::PINSRDrr, X86::PINSRDrm, 0 }, + { X86::PINSRQrr, X86::PINSRQrm, 0 }, + { X86::PINSRWrri, X86::PINSRWrmi, 0 }, + { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 }, + { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, + { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, + { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, + { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, + { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, + { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 }, + { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 }, + { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 }, + { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 }, + { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, + { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, + { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, + { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, + { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, + { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 }, + { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, + { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, + { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, + { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 }, + { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 }, + { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, + { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, + { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 }, + { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 }, + { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 }, + { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 }, + { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, + { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, + { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, + { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 }, + { X86::PSRAWrr, 
X86::PSRAWrm, TB_ALIGN_16 }, + { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 }, + { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 }, + { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, + { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, + { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, + { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 }, + { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, + { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, + { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 }, + { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 }, + { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, + { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, + { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 }, + { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 }, + { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 }, + { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 }, + { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 }, + { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, + { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, + { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, + { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, + { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, + { X86::SBB32rr, X86::SBB32rm, 0 }, + { X86::SBB64rr, X86::SBB64rm, 0 }, + { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, + { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 }, + { X86::SUB16rr, X86::SUB16rm, 0 }, + { X86::SUB32rr, X86::SUB32rm, 0 }, + { X86::SUB64rr, X86::SUB64rm, 0 }, + { X86::SUB8rr, X86::SUB8rm, 0 }, + { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, + { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, + { X86::SUBSDrr, X86::SUBSDrm, 0 }, + { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 }, + { X86::SUBSSrr, X86::SUBSSrm, 0 }, + { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 }, + // FIXME: TEST*rr -> swapped operand of TEST*mr. + { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, + { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, + { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, + { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 }, + { X86::XOR16rr, X86::XOR16rm, 0 }, + { X86::XOR32rr, X86::XOR32rm, 0 }, + { X86::XOR64rr, X86::XOR64rm, 0 }, + { X86::XOR8rr, X86::XOR8rm, 0 }, + { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, + { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, + + // MMX version of foldable instructions + { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 }, + { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 }, + { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 }, + { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 }, + { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 }, + { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 }, + { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 }, + { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 }, + { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 }, + { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 }, + { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 }, + { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 }, + { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 }, + { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 }, + { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 }, + { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 }, + { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 }, + { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 }, + { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 }, + { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 }, + { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 }, + { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 }, + { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 }, + { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 }, + { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 }, + { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 }, + 
{ X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 }, + { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 }, + { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 }, + { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 }, + { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 }, + { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 }, + { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 }, + { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 }, + { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 }, + { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 }, + { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 }, + { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 }, + { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 }, + { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 }, + { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 }, + { X86::MMX_PORirr, X86::MMX_PORirm, 0 }, + { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 }, + { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 }, + { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 }, + { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 }, + { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 }, + { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 }, + { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 }, + { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 }, + { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 }, + { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 }, + { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 }, + { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 }, + { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 }, + { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 }, + { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 }, + { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 }, + { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 }, + { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 }, + { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 }, + { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 }, + { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 }, + { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 }, + { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 }, + { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 }, + { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 }, + { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 }, + { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 }, + { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, + + // 3DNow! 
version of foldable instructions + { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 }, + { X86::PFACCrr, X86::PFACCrm, 0 }, + { X86::PFADDrr, X86::PFADDrm, 0 }, + { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 }, + { X86::PFCMPGErr, X86::PFCMPGErm, 0 }, + { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 }, + { X86::PFMAXrr, X86::PFMAXrm, 0 }, + { X86::PFMINrr, X86::PFMINrm, 0 }, + { X86::PFMULrr, X86::PFMULrm, 0 }, + { X86::PFNACCrr, X86::PFNACCrm, 0 }, + { X86::PFPNACCrr, X86::PFPNACCrm, 0 }, + { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 }, + { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 }, + { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 }, + { X86::PFSUBrr, X86::PFSUBrm, 0 }, + { X86::PFSUBRrr, X86::PFSUBRrm, 0 }, + { X86::PMULHRWrr, X86::PMULHRWrm, 0 }, + + // AVX 128-bit versions of foldable instructions + { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, + { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 }, + { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, + { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, + { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, + { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 }, + { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 }, + { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, + { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, + { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, + { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, + { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, + { X86::VRCPSSr, X86::VRCPSSm, 0 }, + { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 }, + { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, + { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 }, + { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, + { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 }, + { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, + { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 }, + { X86::VADDPDrr, X86::VADDPDrm, 0 }, + { X86::VADDPSrr, X86::VADDPSrm, 0 }, + { X86::VADDSDrr, X86::VADDSDrm, 0 }, + { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 }, + { X86::VADDSSrr, X86::VADDSSrm, 0 }, + { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 }, + { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, + { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, + { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, + { X86::VANDNPSrr, X86::VANDNPSrm, 0 }, + { X86::VANDPDrr, X86::VANDPDrm, 0 }, + { X86::VANDPSrr, X86::VANDPSrm, 0 }, + { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 }, + { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 }, + { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 }, + { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 }, + { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, + { X86::VCMPPSrri, X86::VCMPPSrmi, 0 }, + { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, + { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, + { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, + { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, + { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, + { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 }, + { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, + { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 }, + { X86::VDPPDrri, X86::VDPPDrmi, 0 }, + { X86::VDPPSrri, X86::VDPPSrmi, 0 }, + // Do not fold VFs* loads because there are no scalar load variants for + // these instructions. When folded, the load is required to be 128-bits, so + // the load size would not match. 
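The "load size would not match" reasoning in the comment above boils down to a width check: the packed memory forms of these logical ops always read a full 128 bits, while the scalar register forms only consume a 32- or 64-bit value, so a folded load could read past the end of a scalar stack slot. A tiny illustrative predicate follows; the name and parameters are hypothetical, not LLVM API.

// Returns true when folding would make the instruction read more bytes than
// the scalar value actually occupies (e.g. a 16-byte packed ANDPS-style load
// used for a 4-byte float).  This is why the scalar Fs*/VFs* logical ops are
// deliberately absent from these tables even though the packed-domain
// Fv*/VFv* forms are listed.
static bool foldWouldOverread(unsigned MemFormLoadBytes, unsigned ScalarBytes) {
  return MemFormLoadBytes > ScalarBytes;
}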
+ { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 }, + { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 }, + { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 }, + { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 }, + { X86::VFvORPDrr, X86::VFvORPDrm, 0 }, + { X86::VFvORPSrr, X86::VFvORPSrm, 0 }, + { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 }, + { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 }, + { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, + { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, + { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, + { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, + { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 }, + { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 }, + { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, + { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, + { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, + { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 }, + { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, + { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 }, + { X86::VMINPDrr, X86::VMINPDrm, 0 }, + { X86::VMINPSrr, X86::VMINPSrm, 0 }, + { X86::VMINSDrr, X86::VMINSDrm, 0 }, + { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, + { X86::VMINSSrr, X86::VMINSSrm, 0 }, + { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, + { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, + { X86::VMULPDrr, X86::VMULPDrm, 0 }, + { X86::VMULPSrr, X86::VMULPSrm, 0 }, + { X86::VMULSDrr, X86::VMULSDrm, 0 }, + { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 }, + { X86::VMULSSrr, X86::VMULSSrm, 0 }, + { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 }, + { X86::VORPDrr, X86::VORPDrm, 0 }, + { X86::VORPSrr, X86::VORPSrm, 0 }, + { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, + { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 }, + { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 }, + { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 }, + { X86::VPADDBrr, X86::VPADDBrm, 0 }, + { X86::VPADDDrr, X86::VPADDDrm, 0 }, + { X86::VPADDQrr, X86::VPADDQrm, 0 }, + { X86::VPADDSBrr, X86::VPADDSBrm, 0 }, + { X86::VPADDSWrr, X86::VPADDSWrm, 0 }, + { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 }, + { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 }, + { X86::VPADDWrr, X86::VPADDWrm, 0 }, + { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 }, + { X86::VPANDNrr, X86::VPANDNrm, 0 }, + { X86::VPANDrr, X86::VPANDrm, 0 }, + { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, + { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, + { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 }, + { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, + { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 }, + { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 }, + { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, + { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, + { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 }, + { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 }, + { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 }, + { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 }, + { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 }, + { X86::VPHADDDrr, X86::VPHADDDrm, 0 }, + { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 }, + { X86::VPHADDWrr, X86::VPHADDWrm, 0 }, + { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 }, + { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 }, + { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, + { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, + { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, + { X86::VPINSRBrr, X86::VPINSRBrm, 0 }, + { X86::VPINSRDrr, X86::VPINSRDrm, 0 }, + { X86::VPINSRQrr, X86::VPINSRQrm, 0 }, + { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, + { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 }, + { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, + { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, + { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, + { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, + { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, + { X86::VPMINSBrr, X86::VPMINSBrm, 0 }, + { X86::VPMINSDrr, X86::VPMINSDrm, 0 }, + { X86::VPMINUDrr, X86::VPMINUDrm, 
0 }, + { X86::VPMINUWrr, X86::VPMINUWrm, 0 }, + { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, + { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, + { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, + { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, + { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, + { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 }, + { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, + { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, + { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, + { X86::VPMULLWrr, X86::VPMULLWrm, 0 }, + { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 }, + { X86::VPORrr, X86::VPORrm, 0 }, + { X86::VPSADBWrr, X86::VPSADBWrm, 0 }, + { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 }, + { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 }, + { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 }, + { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 }, + { X86::VPSLLDrr, X86::VPSLLDrm, 0 }, + { X86::VPSLLQrr, X86::VPSLLQrm, 0 }, + { X86::VPSLLWrr, X86::VPSLLWrm, 0 }, + { X86::VPSRADrr, X86::VPSRADrm, 0 }, + { X86::VPSRAWrr, X86::VPSRAWrm, 0 }, + { X86::VPSRLDrr, X86::VPSRLDrm, 0 }, + { X86::VPSRLQrr, X86::VPSRLQrm, 0 }, + { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, + { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, + { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, + { X86::VPSUBQrr, X86::VPSUBQrm, 0 }, + { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, + { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, + { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 }, + { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 }, + { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, + { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, + { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, + { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 }, + { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 }, + { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 }, + { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 }, + { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, + { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, + { X86::VPXORrr, X86::VPXORrm, 0 }, + { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, + { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, + { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, + { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, + { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, + { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, + { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, + { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 }, + { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, + { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 }, + { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, + { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, + { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, + { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, + { X86::VXORPDrr, X86::VXORPDrm, 0 }, + { X86::VXORPSrr, X86::VXORPSrm, 0 }, + + // AVX 256-bit foldable instructions + { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, + { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, + { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 }, + { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 }, + { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 }, + { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 }, + { X86::VANDPDYrr, X86::VANDPDYrm, 0 }, + { X86::VANDPSYrr, X86::VANDPSYrm, 0 }, + { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 }, + { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 }, + { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 }, + { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 }, + { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 }, + { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, + { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, + { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, + { X86::VDPPSYrri, X86::VDPPSYrmi, 0 }, + { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, + { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, + { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, + { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, + { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 }, + { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, + { X86::VMAXPSYrr, 
X86::VMAXPSYrm, 0 }, + { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, + { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, + { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, + { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, + { X86::VORPDYrr, X86::VORPDYrm, 0 }, + { X86::VORPSYrr, X86::VORPSYrm, 0 }, + { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 }, + { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 }, + { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 }, + { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 }, + { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 }, + { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 }, + { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, + { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 }, + { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 }, + { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 }, + { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, + { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, + { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, + + // AVX2 foldable instructions + { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, + { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, + { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 }, + { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 }, + { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 }, + { X86::VPADDBYrr, X86::VPADDBYrm, 0 }, + { X86::VPADDDYrr, X86::VPADDDYrm, 0 }, + { X86::VPADDQYrr, X86::VPADDQYrm, 0 }, + { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 }, + { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 }, + { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 }, + { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 }, + { X86::VPADDWYrr, X86::VPADDWYrm, 0 }, + { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 }, + { X86::VPANDNYrr, X86::VPANDNYrm, 0 }, + { X86::VPANDYrr, X86::VPANDYrm, 0 }, + { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 }, + { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, + { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, + { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, + { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 }, + { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, + { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, + { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, + { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 }, + { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 }, + { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 }, + { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 }, + { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 }, + { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, + { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, + { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, + { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, + { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, + { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, + { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, + { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, + { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, + { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, + { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 }, + { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, + { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, + { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, + { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, + { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, + { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 }, + { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 }, + { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 }, + { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 }, + { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, + { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, + { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, + { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, + { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, + { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, + { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 }, + { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, + { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, + { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, + { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 }, + { 
X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 }, + { X86::VPORYrr, X86::VPORYrm, 0 }, + { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 }, + { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 }, + { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 }, + { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 }, + { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 }, + { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 }, + { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 }, + { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 }, + { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 }, + { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 }, + { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 }, + { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 }, + { X86::VPSRADYrr, X86::VPSRADYrm, 0 }, + { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, + { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, + { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, + { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, + { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, + { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, + { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 }, + { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 }, + { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 }, + { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, + { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, + { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, + { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 }, + { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, + { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, + { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 }, + { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 }, + { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, + { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, + { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, + { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 }, + { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 }, + { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 }, + { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 }, + { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, + { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 }, + { X86::VPXORYrr, X86::VPXORYrm, 0 }, + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE }, + { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rr, 
X86::VFMSUBADDPS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE }, + + // XOP foldable instructions + { X86::VPCMOVrr, X86::VPCMOVmr, 0 }, + { X86::VPCMOVrrY, X86::VPCMOVmrY, 0 }, + { X86::VPCOMBri, X86::VPCOMBmi, 0 }, + { X86::VPCOMDri, X86::VPCOMDmi, 0 }, + { X86::VPCOMQri, X86::VPCOMQmi, 0 }, + { X86::VPCOMWri, X86::VPCOMWmi, 0 }, + { X86::VPCOMUBri, X86::VPCOMUBmi, 0 }, + { X86::VPCOMUDri, X86::VPCOMUDmi, 0 }, + { X86::VPCOMUQri, X86::VPCOMUQmi, 0 }, + { X86::VPCOMUWri, X86::VPCOMUWmi, 0 }, + { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 }, + { X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 }, + { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 }, + { X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 }, + { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 }, + { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 }, + { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 }, + { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 }, + { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 }, + { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 }, + { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 }, + { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 }, + { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 }, + { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 }, + { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 }, + { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 }, + { X86::VPPERMrr, X86::VPPERMmr, 0 }, + { X86::VPROTBrr, X86::VPROTBrm, 0 }, + { X86::VPROTDrr, X86::VPROTDrm, 0 }, + { X86::VPROTQrr, X86::VPROTQrm, 0 }, + { X86::VPROTWrr, X86::VPROTWrm, 0 }, + { X86::VPSHABrr, X86::VPSHABrm, 0 }, + { X86::VPSHADrr, X86::VPSHADrm, 0 }, + { X86::VPSHAQrr, X86::VPSHAQrm, 0 }, + { X86::VPSHAWrr, X86::VPSHAWrm, 0 }, + { X86::VPSHLBrr, X86::VPSHLBrm, 0 }, + { X86::VPSHLDrr, X86::VPSHLDrm, 0 }, + { X86::VPSHLQrr, X86::VPSHLQrm, 0 }, + { X86::VPSHLWrr, X86::VPSHLWrm, 0 }, + + // BMI/BMI2 foldable instructions + { X86::ANDN32rr, X86::ANDN32rm, 0 }, + { X86::ANDN64rr, X86::ANDN64rm, 0 }, + { X86::MULX32rr, X86::MULX32rm, 0 }, + { X86::MULX64rr, X86::MULX64rm, 0 }, + { X86::PDEP32rr, X86::PDEP32rm, 0 }, + { X86::PDEP64rr, X86::PDEP64rm, 0 }, + { X86::PEXT32rr, X86::PEXT32rm, 0 }, + { X86::PEXT64rr, X86::PEXT64rm, 0 }, + + // ADX foldable instructions + { X86::ADCX32rr, X86::ADCX32rm, 0 }, + { X86::ADCX64rr, X86::ADCX64rm, 0 }, + { X86::ADOX32rr, X86::ADOX32rm, 0 }, + { X86::ADOX64rr, X86::ADOX64rm, 0 }, + + // AVX-512 foldable instructions + { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, + { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, + { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, + { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, + { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, + { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, + { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, + { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, + { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, + { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, + { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, + { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, + { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, + { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, + { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, + { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, + { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, + { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, + { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 }, + { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 }, + { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 }, + { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 }, + { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, + { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, + { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, + { X86::VPSLLVDZrr, 
X86::VPSLLVDZrm, 0 }, + { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, + { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, + { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, + { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, + { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, + { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, + { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, + { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, + { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 }, + { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 }, + { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, + { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, + + // AVX-512{F,VL} foldable instructions + { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, + + // AVX-512{F,VL} foldable instructions + { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, + { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, + { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, + { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, + + // AES foldable instructions + { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, + { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 }, + { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 }, + { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 }, + { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 }, + { X86::VAESDECrr, X86::VAESDECrm, 0 }, + { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 }, + { X86::VAESENCrr, X86::VAESENCrm, 0 }, + + // SHA foldable instructions + { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 }, + { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 }, + { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 }, + { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 }, + { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 }, + { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 }, + { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) { + AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 2, folded load + Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { + // FMA foldable instructions + { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, + + { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, + { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, + { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE }, + { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE }, + { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE }, + { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE }, + { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE }, + { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, 
TB_ALIGN_NONE }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE }, + { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE }, + { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, + + { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, + + { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE }, + { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE }, + { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE }, + { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, + + { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, + + { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE }, + { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, + + { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 
TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, + + { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE }, + + { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE }, + + { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE }, + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE }, + { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rr, 
X86::VFNMADDPS4rm, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE }, + + // XOP foldable instructions + { X86::VPCMOVrr, X86::VPCMOVrm, 0 }, + { X86::VPCMOVrrY, X86::VPCMOVrmY, 0 }, + { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 }, + { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 }, + { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 }, + { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 }, + { X86::VPPERMrr, X86::VPPERMrm, 0 }, + + // AVX-512 VPERMI instructions with 3 source operands. 
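// [Editor's note, not part of the patch] As with the FMA3 entries above, each
// pair below maps the register form (left) to the variant that folds operand 3
// into a load (right); the registration loop at the end of this table tags
// them with TB_INDEX_3 | TB_FOLDED_LOAD.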
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, + { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, + { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, + { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, + { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 }, + { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 }, + { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 }, + { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 }, + { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, + // AVX-512 arithmetic instructions + { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, + { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, + { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, + { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, + { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, + { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, + { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, + { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, + { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, + { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, + { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, + { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, + // AVX-512{F,VL} arithmetic instructions 256-bit + { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, + { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, + { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, + { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, + { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, + { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, + { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, + { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, + { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, + { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, + { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, + { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, + // AVX-512{F,VL} arithmetic instructions 128-bit + { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, + { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, + { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, + { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, + { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, + { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, + { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, + { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, + { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, + { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, + { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, + { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { + AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 3, folded load + Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { + // AVX-512 foldable instructions + { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, + { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, + { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, + { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, + { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, + { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, + { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, + { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, + { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, + { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, + { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, + { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, + // AVX-512{F,VL} foldable instructions 256-bit + { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, + { 
X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, + { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, + { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, + { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, + { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, + { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, + { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, + { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, + { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, + { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, + { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, + // AVX-512{F,VL} foldable instructions 128-bit + { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, + { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, + { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, + { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, + { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, + { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, + { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, + { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, + { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, + { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, + { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, + { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { + AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 4, folded load + Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); + } +} + +void +X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable, + MemOp2RegOpTableType &M2RTable, + unsigned RegOp, unsigned MemOp, unsigned Flags) { + if ((Flags & TB_NO_FORWARD) == 0) { + assert(!R2MTable.count(RegOp) && "Duplicate entry!"); + R2MTable[RegOp] = std::make_pair(MemOp, Flags); + } + if ((Flags & TB_NO_REVERSE) == 0) { + assert(!M2RTable.count(MemOp) && + "Duplicated entries in unfolding maps?"); + M2RTable[MemOp] = std::make_pair(RegOp, Flags); + } +} + +bool +X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: break; + case X86::MOVSX16rr8: + case X86::MOVZX16rr8: + case X86::MOVSX32rr8: + case X86::MOVZX32rr8: + case X86::MOVSX64rr8: + if (!Subtarget.is64Bit()) + // It's not always legal to reference the low 8-bit of the larger + // register in 32-bit mode. + return false; + case X86::MOVSX32rr16: + case X86::MOVZX32rr16: + case X86::MOVSX64rr16: + case X86::MOVSX64rr32: { + if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) + // Be conservative. 
+ return false; + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + switch (MI.getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::MOVSX16rr8: + case X86::MOVZX16rr8: + case X86::MOVSX32rr8: + case X86::MOVZX32rr8: + case X86::MOVSX64rr8: + SubIdx = X86::sub_8bit; + break; + case X86::MOVSX32rr16: + case X86::MOVZX32rr16: + case X86::MOVSX64rr16: + SubIdx = X86::sub_16bit; + break; + case X86::MOVSX64rr32: + SubIdx = X86::sub_32bit; + break; + } + return true; + } + } + return false; +} + +int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + + if (MI->getOpcode() == getCallFrameSetupOpcode() || + MI->getOpcode() == getCallFrameDestroyOpcode()) { + unsigned StackAlign = TFI->getStackAlignment(); + int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * + StackAlign; + + SPAdj -= MI->getOperand(1).getImm(); + + if (MI->getOpcode() == getCallFrameSetupOpcode()) + return SPAdj; + else + return -SPAdj; + } + + // To know whether a call adjusts the stack, we need information + // that is bound to the following ADJCALLSTACKUP pseudo. + // Look for the next ADJCALLSTACKUP that follows the call. + if (MI->isCall()) { + const MachineBasicBlock* MBB = MI->getParent(); + auto I = ++MachineBasicBlock::const_iterator(MI); + for (auto E = MBB->end(); I != E; ++I) { + if (I->getOpcode() == getCallFrameDestroyOpcode() || + I->isCall()) + break; + } + + // If we could not find a frame destroy opcode, then it has already + // been simplified, so we don't care. + if (I->getOpcode() != getCallFrameDestroyOpcode()) + return 0; + + return -(I->getOperand(1).getImm()); + } + + // Currently handle only PUSHes we can reasonably expect to see + // in call sequences + switch (MI->getOpcode()) { + default: + return 0; + case X86::PUSH32i8: + case X86::PUSH32r: + case X86::PUSH32rmm: + case X86::PUSH32rmr: + case X86::PUSHi32: + return 4; + } +} + +/// Return true and the FrameIndex if the specified +/// operand and follow operands form a reference to the stack frame. 
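// [Editor's note, not part of the patch] The call-frame handling in
// getSPAdjust() above first rounds the ADJCALLSTACKDOWN amount up to the stack
// alignment and then subtracts the portion already applied via the pseudo's
// second operand. A minimal standalone sketch of just that rounding step,
// assuming only a positive integer alignment (the helper name is
// hypothetical):
static int64_t roundUpToStackAlign(int64_t Amount, int64_t StackAlign) {
  // Same integer arithmetic as above: the next multiple of StackAlign that is
  // greater than or equal to Amount.
  return (Amount + StackAlign - 1) / StackAlign * StackAlign;
}
// For example, roundUpToStackAlign(20, 16) == 32; the frame-destroy pseudo
// then yields the negated adjustment.
// (End of editor's note; the function documented above follows.)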
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, + int &FrameIndex) const { + if (MI->getOperand(Op+X86::AddrBaseReg).isFI() && + MI->getOperand(Op+X86::AddrScaleAmt).isImm() && + MI->getOperand(Op+X86::AddrIndexReg).isReg() && + MI->getOperand(Op+X86::AddrDisp).isImm() && + MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 && + MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 && + MI->getOperand(Op+X86::AddrDisp).getImm() == 0) { + FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex(); + return true; + } + return false; +} + +static bool isFrameLoadOpcode(int Opcode) { + switch (Opcode) { + default: + return false; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp64m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MOVAPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::VMOVAPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVUPSYrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPDYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQUYrm: + case X86::VMOVDQAYrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::VMOVAPSZrm: + case X86::VMOVUPSZrm: + return true; + } +} + +static bool isFrameStoreOpcode(int Opcode) { + switch (Opcode) { + default: break; + case X86::MOV8mr: + case X86::MOV16mr: + case X86::MOV32mr: + case X86::MOV64mr: + case X86::ST_FpP64m: + case X86::MOVSSmr: + case X86::MOVSDmr: + case X86::MOVAPSmr: + case X86::MOVAPDmr: + case X86::MOVDQAmr: + case X86::VMOVSSmr: + case X86::VMOVSDmr: + case X86::VMOVAPSmr: + case X86::VMOVAPDmr: + case X86::VMOVDQAmr: + case X86::VMOVUPSYmr: + case X86::VMOVAPSYmr: + case X86::VMOVUPDYmr: + case X86::VMOVAPDYmr: + case X86::VMOVDQUYmr: + case X86::VMOVDQAYmr: + case X86::VMOVUPSZmr: + case X86::VMOVAPSZmr: + case X86::MMX_MOVD64mr: + case X86::MMX_MOVQ64mr: + case X86::MMX_MOVNTQmr: + return true; + } + return false; +} + +unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameLoadOpcode(MI->getOpcode())) + if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) + return MI->getOperand(0).getReg(); + return 0; +} + +unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameLoadOpcode(MI->getOpcode())) { + unsigned Reg; + if ((Reg = isLoadFromStackSlot(MI, FrameIndex))) + return Reg; + // Check for post-frame index elimination operations + const MachineMemOperand *Dummy; + return hasLoadFromStackSlot(MI, Dummy, FrameIndex); + } + return 0; +} + +unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameStoreOpcode(MI->getOpcode())) + if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 && + isFrameOperand(MI, 0, FrameIndex)) + return MI->getOperand(X86::AddrNumOperands).getReg(); + return 0; +} + +unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameStoreOpcode(MI->getOpcode())) { + unsigned Reg; + if ((Reg = isStoreToStackSlot(MI, FrameIndex))) + return Reg; + // Check for post-frame index elimination operations + const MachineMemOperand *Dummy; + return hasStoreToStackSlot(MI, Dummy, FrameIndex); + } + return 0; +} + +/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r. +static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { + // Don't waste compile time scanning use-def chains of physregs. 
+ if (!TargetRegisterInfo::isVirtualRegister(BaseReg)) + return false; + bool isPICBase = false; + for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg), + E = MRI.def_instr_end(); I != E; ++I) { + MachineInstr *DefMI = &*I; + if (DefMI->getOpcode() != X86::MOVPC32r) + return false; + assert(!isPICBase && "More than one PIC base?"); + isPICBase = true; + } + return isPICBase; +} + +bool +X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + switch (MI->getOpcode()) { + default: break; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp64m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MOVAPSrm: + case X86::MOVUPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::MOVDQUrm: + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVDQUrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::FsVMOVAPSrm: + case X86::FsVMOVAPDrm: + case X86::FsMOVAPSrm: + case X86::FsMOVAPDrm: + // AVX-512 + case X86::VMOVAPDZ128rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVAPDZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZrm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQU16Z256rm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVDQU64Zrm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU8Zrm: + case X86::VMOVUPSZ128rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZrm: { + // Loads from constant pools are trivially rematerializable. + if (MI->getOperand(1+X86::AddrBaseReg).isReg() && + MI->getOperand(1+X86::AddrScaleAmt).isImm() && + MI->getOperand(1+X86::AddrIndexReg).isReg() && + MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && + MI->isInvariantLoad(AA)) { + unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); + if (BaseReg == 0 || BaseReg == X86::RIP) + return true; + // Allow re-materialization of PIC load. + if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal()) + return false; + const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return regIsPICBase(BaseReg, MRI); + } + return false; + } + + case X86::LEA32r: + case X86::LEA64r: { + if (MI->getOperand(1+X86::AddrScaleAmt).isImm() && + MI->getOperand(1+X86::AddrIndexReg).isReg() && + MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && + !MI->getOperand(1+X86::AddrDisp).isReg()) { + // lea fi#, lea GV, etc. are all rematerializable. + if (!MI->getOperand(1+X86::AddrBaseReg).isReg()) + return true; + unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); + if (BaseReg == 0) + return true; + // Allow re-materialization of lea PICBase + x. + const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return regIsPICBase(BaseReg, MRI); + } + return false; + } + } + + // All other instructions marked M_REMATERIALIZABLE are always trivially + // rematerializable. 
+ return true; +} + +bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + MachineBasicBlock::iterator E = MBB.end(); + + // For compile time consideration, if we are not able to determine the + // safety after visiting 4 instructions in each direction, we will assume + // it's not safe. + MachineBasicBlock::iterator Iter = I; + for (unsigned i = 0; Iter != E && i < 4; ++i) { + bool SeenDef = false; + for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { + MachineOperand &MO = Iter->getOperand(j); + if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) + SeenDef = true; + if (!MO.isReg()) + continue; + if (MO.getReg() == X86::EFLAGS) { + if (MO.isUse()) + return false; + SeenDef = true; + } + } + + if (SeenDef) + // This instruction defines EFLAGS, no need to look any further. + return true; + ++Iter; + // Skip over DBG_VALUE. + while (Iter != E && Iter->isDebugValue()) + ++Iter; + } + + // It is safe to clobber EFLAGS at the end of a block of no successor has it + // live in. + if (Iter == E) { + for (MachineBasicBlock *S : MBB.successors()) + if (S->isLiveIn(X86::EFLAGS)) + return false; + return true; + } + + MachineBasicBlock::iterator B = MBB.begin(); + Iter = I; + for (unsigned i = 0; i < 4; ++i) { + // If we make it to the beginning of the block, it's safe to clobber + // EFLAGS iff EFLAGS is not live-in. + if (Iter == B) + return !MBB.isLiveIn(X86::EFLAGS); + + --Iter; + // Skip over DBG_VALUE. + while (Iter != B && Iter->isDebugValue()) + --Iter; + + bool SawKill = false; + for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { + MachineOperand &MO = Iter->getOperand(j); + // A register mask may clobber EFLAGS, but we should still look for a + // live EFLAGS def. + if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) + SawKill = true; + if (MO.isReg() && MO.getReg() == X86::EFLAGS) { + if (MO.isDef()) return MO.isDead(); + if (MO.isKill()) SawKill = true; + } + } + + if (SawKill) + // This instruction kills EFLAGS and doesn't redefine it, so + // there's no need to look further. + return true; + } + + // Conservative answer. + return false; +} + +void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const { + bool ClobbersEFLAGS = false; + for (const MachineOperand &MO : Orig->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + ClobbersEFLAGS = true; + break; + } + } + + if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { + // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side + // effects. + int Value; + switch (Orig->getOpcode()) { + case X86::MOV32r0: Value = 0; break; + case X86::MOV32r1: Value = 1; break; + case X86::MOV32r_1: Value = -1; break; + default: + llvm_unreachable("Unexpected instruction!"); + } + + DebugLoc DL = Orig->getDebugLoc(); + BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) + .addImm(Value); + } else { + MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); + MBB.insert(I, MI); + } + + MachineInstr *NewMI = std::prev(I); + NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); +} + +/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. 
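// [Editor's note, not part of the patch] isSafeToClobberEFLAGS() above decides
// whether EFLAGS may be redefined at a point by scanning at most four
// instructions in each direction. Below is a deliberately simplified,
// standalone model of the forward half only (hypothetical names; the real
// routine also scans backwards and, at a block boundary, checks whether any
// successor has EFLAGS live-in):
struct ToyInst {
  bool ReadsFlags;  // models a use of EFLAGS
  bool WritesFlags; // models a def or regmask clobber of EFLAGS
};
static bool toyForwardScanSaysSafe(const ToyInst *Insts, unsigned NumInsts,
                                   unsigned Start) {
  for (unsigned Step = 0; Step != 4 && Start != NumInsts; ++Step, ++Start) {
    if (Insts[Start].ReadsFlags)
      return false; // a later reader still needs the current value
    if (Insts[Start].WritesFlags)
      return true;  // flags fully redefined before any use
  }
  return false;     // budget exhausted: be conservative
}
// (End of editor's note; the member function documented above follows.)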
+bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef() && + MO.getReg() == X86::EFLAGS && !MO.isDead()) { + return true; + } + } + return false; +} + +/// Check whether the shift count for a machine operand is non-zero. +inline static unsigned getTruncatedShiftCount(MachineInstr *MI, + unsigned ShiftAmtOperandIdx) { + // The shift count is six bits with the REX.W prefix and five bits without. + unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31; + unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm(); + return Imm & ShiftCountMask; +} + +/// Check whether the given shift count is appropriate +/// can be represented by a LEA instruction. +inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { + // Left shift instructions can be transformed into load-effective-address + // instructions if we can encode them appropriately. + // A LEA instruction utilizes a SIB byte to encode its scale factor. + // The SIB.scale field is two bits wide which means that we can encode any + // shift amount less than 4. + return ShAmt < 4 && ShAmt > 0; +} + +bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, + unsigned Opc, bool AllowSP, + unsigned &NewSrc, bool &isKill, bool &isUndef, + MachineOperand &ImplicitOp) const { + MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterClass *RC; + if (AllowSP) { + RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass; + } else { + RC = Opc != X86::LEA32r ? + &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; + } + unsigned SrcReg = Src.getReg(); + + // For both LEA64 and LEA32 the register already has essentially the right + // type (32-bit or 64-bit) we may just need to forbid SP. + if (Opc != X86::LEA64_32r) { + NewSrc = SrcReg; + isKill = Src.isKill(); + isUndef = Src.isUndef(); + + if (TargetRegisterInfo::isVirtualRegister(NewSrc) && + !MF.getRegInfo().constrainRegClass(NewSrc, RC)) + return false; + + return true; + } + + // This is for an LEA64_32r and incoming registers are 32-bit. One way or + // another we need to add 64-bit registers to the final MI. + if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + ImplicitOp = Src; + ImplicitOp.setImplicit(); + + NewSrc = getX86SubSuperRegister(Src.getReg(), 64); + MachineBasicBlock::LivenessQueryResult LQR = + MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); + + switch (LQR) { + case MachineBasicBlock::LQR_Unknown: + // We can't give sane liveness flags to the instruction, abandon LEA + // formation. + return false; + case MachineBasicBlock::LQR_Live: + isKill = MI->killsRegister(SrcReg); + isUndef = false; + break; + default: + // The physreg itself is dead, so we have to use it as an <undef>. + isKill = false; + isUndef = true; + break; + } + } else { + // Virtual register of the wrong class, we have to create a temporary 64-bit + // vreg to feed into the LEA. + NewSrc = MF.getRegInfo().createVirtualRegister(RC); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + get(TargetOpcode::COPY)) + .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) + .addOperand(Src); + + // Which is obviously going to be dead after we're done with it. + isKill = true; + isUndef = false; + } + + // We've set all the parameters without issue. 
+ return true; +} + +/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit +/// LEA to form 3-address code by promoting to a 32-bit superregister and then +/// truncating back down to a 16-bit subregister. +MachineInstr * +X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, + MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + MachineInstr *MI = MBBI; + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src = MI->getOperand(1).getReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isKill = MI->getOperand(1).isKill(); + + MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); + unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); + unsigned Opc, leaInReg; + if (Subtarget.is64Bit()) { + Opc = X86::LEA64_32r; + leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + } else { + Opc = X86::LEA32r; + leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + } + + // Build and insert into an implicit UNDEF value. This is OK because + // well be shifting and then extracting the lower 16-bits. + // This has the potential to cause partial register stall. e.g. + // movw (%rbp,%rcx,2), %dx + // leal -65(%rdx), %esi + // But testing has shown this *does* help performance in 64-bit mode (at + // least on modern x86 machines). + BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg); + MachineInstr *InsMI = + BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(leaInReg, RegState::Define, X86::sub_16bit) + .addReg(Src, getKillRegState(isKill)); + + MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(), + get(Opc), leaOutReg); + switch (MIOpc) { + default: llvm_unreachable("Unreachable!"); + case X86::SHL16ri: { + unsigned ShAmt = MI->getOperand(2).getImm(); + MIB.addReg(0).addImm(1 << ShAmt) + .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0); + break; + } + case X86::INC16r: + addRegOffset(MIB, leaInReg, true, 1); + break; + case X86::DEC16r: + addRegOffset(MIB, leaInReg, true, -1); + break; + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm()); + break; + case X86::ADD16rr: + case X86::ADD16rr_DB: { + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + unsigned leaInReg2 = 0; + MachineInstr *InsMI2 = nullptr; + if (Src == Src2) { + // ADD16rr %reg1028<kill>, %reg1028 + // just a single insert_subreg. + addRegReg(MIB, leaInReg, true, leaInReg, false); + } else { + if (Subtarget.is64Bit()) + leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + else + leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + // Build and insert into an implicit UNDEF value. This is OK because + // well be shifting and then extracting the lower 16-bits. 
+ BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2); + InsMI2 = + BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(leaInReg2, RegState::Define, X86::sub_16bit) + .addReg(Src2, getKillRegState(isKill2)); + addRegReg(MIB, leaInReg, true, leaInReg2, true); + } + if (LV && isKill2 && InsMI2) + LV->replaceKillInstruction(Src2, MI, InsMI2); + break; + } + } + + MachineInstr *NewMI = MIB; + MachineInstr *ExtMI = + BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(Dest, RegState::Define | getDeadRegState(isDead)) + .addReg(leaOutReg, RegState::Kill, X86::sub_16bit); + + if (LV) { + // Update live variables + LV->getVarInfo(leaInReg).Kills.push_back(NewMI); + LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI); + if (isKill) + LV->replaceKillInstruction(Src, MI, InsMI); + if (isDead) + LV->replaceKillInstruction(Dest, MI, ExtMI); + } + + return ExtMI; +} + +/// This method must be implemented by targets that +/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target +/// may be able to convert a two-address instruction into a true +/// three-address instruction on demand. This allows the X86 target (for +/// example) to convert ADD and SHL instructions into LEA instructions if they +/// would require register copies due to two-addressness. +/// +/// This method returns a null pointer if the transformation cannot be +/// performed, otherwise it returns the new instruction. +/// +MachineInstr * +X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + MachineInstr *MI = MBBI; + + // The following opcodes also sets the condition code register(s). Only + // convert them to equivalent lea if the condition code register def's + // are dead! + if (hasLiveCondCodeDef(MI)) + return nullptr; + + MachineFunction &MF = *MI->getParent()->getParent(); + // All instructions input are two-addr instructions. Get the known operands. + const MachineOperand &Dest = MI->getOperand(0); + const MachineOperand &Src = MI->getOperand(1); + + MachineInstr *NewMI = nullptr; + // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When + // we have better subtarget support, enable the 16-bit LEA generation here. + // 16-bit LEA is also slow on Core2. + bool DisableLEA16 = true; + bool is64Bit = Subtarget.is64Bit(); + + unsigned MIOpc = MI->getOpcode(); + switch (MIOpc) { + default: return nullptr; + case X86::SHL64ri: { + assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; + + // LEA can't handle RSP. + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), + &X86::GR64_NOSPRegClass)) + return nullptr; + + NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); + break; + } + case X86::SHL32ri: { + assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; + + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + + // LEA can't handle ESP. 
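// [Editor's note, not part of the patch] classifyLEAReg() below either
// constrains the 32-bit source away from SP (AllowSP is false here) or, for
// LEA64_32r, produces a 64-bit register to feed the LEA: the super-register of
// a physical source (with the original register added back as an implicit
// operand) or a fresh 64-bit vreg copied from a virtual source.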
+ bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addImm(0).addReg(0); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + NewMI = MIB; + + break; + } + case X86::SHL16ri: { + assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; + + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr; + NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); + break; + } + case X86::INC64r: + case X86::INC32r: { + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, 1); + break; + } + case X86::INC16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), 1); + break; + case X86::DEC64r: + case X86::DEC32r: { + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, -1); + + break; + } + case X86::DEC16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), -1); + break; + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD32rr: + case X86::ADD32rr_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc; + if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) + Opc = X86::LEA64r; + else + Opc = is64Bit ? 
X86::LEA64_32r : X86::LEA32r; + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + const MachineOperand &Src2 = MI->getOperand(2); + bool isKill2, isUndef2; + unsigned SrcReg2; + MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, + SrcReg2, isKill2, isUndef2, ImplicitOp2)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + if (ImplicitOp2.getReg() != 0) + MIB.addOperand(ImplicitOp2); + + NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); + + // Preserve undefness of the operands. + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + + if (LV && Src2.isKill()) + LV->replaceKillInstruction(SrcReg2, MI, NewMI); + break; + } + case X86::ADD16rr: + case X86::ADD16rr_DB: { + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + + if (LV && isKill2) + LV->replaceKillInstruction(Src2, MI, NewMI); + break; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, MI->getOperand(2).getImm()); + break; + } + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (DisableLEA16) + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; + } + + if (!NewMI) return nullptr; + + if (LV) { // Update live variables + if (Src.isKill()) + LV->replaceKillInstruction(Src.getReg(), MI, NewMI); + if (Dest.isDead()) + LV->replaceKillInstruction(Dest.getReg(), MI, NewMI); + } + + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; +} + +/// Returns true if the given instruction opcode is FMA3. +/// Otherwise, returns false. +/// The second parameter is optional and is used as the second return from +/// the function. It is set to true if the given instruction has FMA3 opcode +/// that is used for lowering of scalar FMA intrinsics, and it is set to false +/// otherwise. +static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { + if (IsIntrinsic) + *IsIntrinsic = false; + + switch (Opcode) { + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case 
X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + return true; + + case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: + case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: + case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: + case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: + case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: + case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: + + case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: + case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: + case X86::VFMSUBSSr213r_Int: case 
X86::VFMSUBSSr213m_Int: + case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: + case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: + case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: + case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: + + case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: + case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: + case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: + case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: + case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: + case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: + if (IsIntrinsic) + *IsIntrinsic = true; + return true; + default: + return false; + } + llvm_unreachable("Opcode not handled by the switch"); +} + +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { + switch (MI->getOpcode()) { + case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) + case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) + case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I) + case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I) + case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I) + case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I) + unsigned Opc; + unsigned Size; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break; + case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break; + case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break; + case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break; + case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break; + case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break; + } + unsigned Amt = MI->getOperand(3).getImm(); + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + MI->getOperand(3).setImm(Size-Amt); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } + case X86::BLENDPDrri: + case X86::BLENDPSrri: + case X86::PBLENDWrri: + case X86::VBLENDPDrri: + case X86::VBLENDPSrri: + case X86::VBLENDPDYrri: + case X86::VBLENDPSYrri: + case X86::VPBLENDDrri: + case X86::VPBLENDWrri: + case X86::VPBLENDDYrri: + case X86::VPBLENDWYrri:{ + unsigned Mask; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::BLENDPDrri: Mask = 0x03; break; + case X86::BLENDPSrri: Mask = 0x0F; break; + case X86::PBLENDWrri: Mask = 0xFF; break; + case X86::VBLENDPDrri: Mask = 0x03; break; + case X86::VBLENDPSrri: Mask = 0x0F; break; + case X86::VBLENDPDYrri: Mask = 0x0F; break; + case X86::VBLENDPSYrri: Mask = 0xFF; break; + case X86::VPBLENDDrri: Mask = 0x0F; break; + case X86::VPBLENDWrri: Mask = 0xFF; break; + case X86::VPBLENDDYrri: Mask = 0xFF; break; + case X86::VPBLENDWYrri: Mask = 0xFF; break; + } + // Only the least significant bits of Imm are used. 
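// [Editor's note, not part of the patch] Swapping the two vector sources
// selects the opposite source in every lane the blend controls, so the new
// immediate is the mask-limited complement computed below as Mask ^ Imm.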
+ unsigned Imm = MI->getOperand(3).getImm() & Mask; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Mask ^ Imm); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } + case X86::PCLMULQDQrr: + case X86::VPCLMULQDQrr:{ + // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0] + // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0] + unsigned Imm = MI->getOperand(3).getImm(); + unsigned Src1Hi = Imm & 0x01; + unsigned Src2Hi = Imm & 0x10; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } + case X86::CMPPDrri: + case X86::CMPPSrri: + case X86::VCMPPDrri: + case X86::VCMPPSrri: + case X86::VCMPPDYrri: + case X86::VCMPPSYrri: { + // Float comparison can be safely commuted for + // Ordered/Unordered/Equal/NotEqual tests + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + default: + return nullptr; + } + } + case X86::VPCOMBri: case X86::VPCOMUBri: + case X86::VPCOMDri: case X86::VPCOMUDri: + case X86::VPCOMQri: case X86::VPCOMUQri: + case X86::VPCOMWri: case X86::VPCOMUWri: { + // Flip comparison mode immediate (if necessary). + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: Imm = 0x02; break; // LT -> GT + case 0x01: Imm = 0x03; break; // LE -> GE + case 0x02: Imm = 0x00; break; // GT -> LT + case 0x03: Imm = 0x01; break; // GE -> LE + case 0x04: // EQ + case 0x05: // NE + case 0x06: // FALSE + case 0x07: // TRUE + default: + break; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Imm); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } + case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: + case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: + case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: + case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr: + case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr: + case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr: + case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr: + case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr: + case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr: + case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr: + case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr: + case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr: + case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr: + case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr: + case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr: + case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break; + case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break; + case 
X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break; + case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break; + case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break; + case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break; + case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break; + case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break; + case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break; + case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break; + case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break; + case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break; + case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break; + case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break; + case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break; + case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break; + case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break; + case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break; + case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break; + case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break; + case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break; + case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break; + case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break; + case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break; + case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break; + case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break; + case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break; + case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break; + case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break; + case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break; + case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break; + case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break; + case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break; + case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break; + case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break; + case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break; + case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break; + case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break; + case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break; + case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break; + case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break; + case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break; + case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break; + case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break; + case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break; + case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break; + case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break; + case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + // Fallthrough intended. + } + default: + if (isFMA3(MI->getOpcode())) { + unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + if (Opc == 0) + return nullptr; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } +} + +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + + unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + + // Only the first RegOpsNum operands are commutable. + // Also, the value 'CommuteAnyOperandIndex' is valid here as it means + // that the operand is not specified/fixed. 
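+ // For example (illustrative): a memory-form FMA (isMem(MI, 3)) has only two
+ // commutable register operands (1 and 2), so RegOpsNum == 2; the register
+ // form exposes all three source operands. Callers may also pass
+ // CommuteAnyOperandIndex to let this routine pick the operands itself,
+ // which is handled further below.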
+ if (SrcOpIdx1 != CommuteAnyOperandIndex && + (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + return false; + if (SrcOpIdx2 != CommuteAnyOperandIndex && + (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + return false; + + // Look for two different register operands assumed to be commutable + // regardless of the FMA opcode. The FMA opcode is adjusted later. + if (SrcOpIdx1 == CommuteAnyOperandIndex || + SrcOpIdx2 == CommuteAnyOperandIndex) { + unsigned CommutableOpIdx1 = SrcOpIdx1; + unsigned CommutableOpIdx2 = SrcOpIdx2; + + // At least one of the operands to be commuted is not specified and + // this method is free to choose appropriate commutable operands. + if (SrcOpIdx1 == SrcOpIdx2) + // Neither operand is fixed. By default, set one of the commutable + // operands to the last register operand of the instruction. + CommutableOpIdx2 = RegOpsNum; + else if (SrcOpIdx2 == CommuteAnyOperandIndex) + // Only one of the operands is not fixed. + CommutableOpIdx2 = SrcOpIdx1; + + // CommutableOpIdx2 is well defined now. Let's choose another commutable + // operand and assign its index to CommutableOpIdx1. + unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + // The commuted operands must have different registers. + // Otherwise, the commute transformation does not change anything and + // would be useless. + if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + break; + } + + // No appropriate commutable operands were found. + if (CommutableOpIdx1 == 0) + return false; + + // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2 + // to return those values. + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + } + + // Check if we can adjust the opcode to preserve the semantics when + // commuting the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; +} + +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + unsigned Opc = MI->getOpcode(); + + // Define the array that holds FMA opcodes in groups + // of 3 opcodes (132, 213, 231) in each group.
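+ // Background (annotation, following the usual FMA3 operand-order naming):
+ //   FMA132: dst = dst  * src3 + src2
+ //   FMA213: dst = src2 * dst  + src3
+ //   FMA231: dst = src2 * src3 + dst
+ // Commuting two source operands therefore corresponds to switching between
+ // these three forms, which is what the tables and FormMapping below encode.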
+ static const unsigned RegularOpcodeGroups[][3] = { + { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, + { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, + { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, + { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, + { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, + { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, + { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, + { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, + { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, + { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, + + { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, + { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, + { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, + { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, + { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, + { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, + { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, + + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, + { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, + { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, + { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, + { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, + { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, + { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, + + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, + { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, + { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, + { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, + { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, + { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, + { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY }, + + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, 
X86::VFMADDSUBPSr231rY }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, + { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, + { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, + { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, + { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, + + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, + { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, + { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, + { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, + { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } + }; + + // Define the array that holds FMA*_Int opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. + static const unsigned IntrinOpcodeGroups[][3] = { + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, + { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, + { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, + + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, + { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, + { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, + + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, + { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, + { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, + + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, + { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, + { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, + }; + + const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; + const unsigned FormsNum = 3; + + bool IsIntrinOpcode; + isFMA3(Opc, &IsIntrinOpcode); + + size_t GroupsNum; + const unsigned (*OpcodeGroups)[3]; + if (IsIntrinOpcode) { + GroupsNum = array_lengthof(IntrinOpcodeGroups); + OpcodeGroups = IntrinOpcodeGroups; + } else { + GroupsNum = array_lengthof(RegularOpcodeGroups); + OpcodeGroups = RegularOpcodeGroups; + } + + const unsigned *FoundOpcodesGroup = nullptr; + size_t FormIndex; + + // Look for the input opcode in the corresponding opcodes table. + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { + if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { + FoundOpcodesGroup = OpcodeGroups[GroupIndex]; + break; + } + } + } + + // The input opcode does not match with any of the opcodes from the tables. + // The unsupported FMA opcode must be added to one of the two opcode groups + // defined above. 
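+ // Note (illustrative): in this file the routine is only reached for opcodes
+ // that isFMA3() has already accepted, so a missing entry means the tables
+ // above are out of sync with isFMA3(); hence the assert rather than a
+ // "return 0".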
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); + + // Put the lowest index to SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); + + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such analysis are + // not implemented yet. So, just return 0 in that case. + // When such analysis are available this place will be the right place for + // calling it. + if (IsIntrinOpcode && SrcOpIdx1 == 1) + return 0; + + unsigned Case; + if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) + Case = 0; + else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) + Case = 1; + else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) + Case = 2; + else + return 0; + + // Define the FMA forms mapping array that helps to map input FMA form + // to output FMA form to preserve the operation semantics after + // commuting the operands. + static const unsigned FormMapping[][3] = { + // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; + // FMA132 A, C, b; ==> FMA231 C, A, b; + // FMA213 B, A, c; ==> FMA213 A, B, c; + // FMA231 C, A, b; ==> FMA132 A, C, b; + { Form231Index, Form213Index, Form132Index }, + // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; + // FMA132 A, c, B; ==> FMA132 B, c, A; + // FMA213 B, a, C; ==> FMA231 C, a, B; + // FMA231 C, a, B; ==> FMA213 B, a, C; + { Form132Index, Form231Index, Form213Index }, + // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; + // FMA132 a, C, B; ==> FMA213 a, B, C; + // FMA213 b, A, C; ==> FMA132 b, C, A; + // FMA231 c, A, B; ==> FMA231 c, B, A; + { Form213Index, Form132Index, Form231Index } + }; + + // Everything is ready, just adjust the FMA opcode and return it. + FormIndex = FormMapping[Case][FormIndex]; + return FoundOpcodesGroup[FormIndex]; +} + +bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + switch (MI->getOpcode()) { + case X86::CMPPDrri: + case X86::CMPPSrri: + case X86::VCMPPDrri: + case X86::VCMPPSrri: + case X86::VCMPPDYrri: + case X86::VCMPPSYrri: { + // Float comparison can be safely commuted for + // Ordered/Unordered/Equal/NotEqual tests + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + // The indices of the commutable operands are 1 and 2. + // Assign them to the returned operand indices here. 
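+ // Rationale (illustrative): these four predicates are symmetric, e.g.
+ // "cmpeqps a, b" yields the same mask as "cmpeqps b, a", so the two source
+ // operands can be swapped freely; non-symmetric predicates such as LT/LE
+ // fall through to the "return false" below.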
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + } + return false; + } + default: + if (isFMA3(MI->getOpcode())) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + } + return false; +} + +static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { + switch (BrOpc) { + default: return X86::COND_INVALID; + case X86::JE_1: return X86::COND_E; + case X86::JNE_1: return X86::COND_NE; + case X86::JL_1: return X86::COND_L; + case X86::JLE_1: return X86::COND_LE; + case X86::JG_1: return X86::COND_G; + case X86::JGE_1: return X86::COND_GE; + case X86::JB_1: return X86::COND_B; + case X86::JBE_1: return X86::COND_BE; + case X86::JA_1: return X86::COND_A; + case X86::JAE_1: return X86::COND_AE; + case X86::JS_1: return X86::COND_S; + case X86::JNS_1: return X86::COND_NS; + case X86::JP_1: return X86::COND_P; + case X86::JNP_1: return X86::COND_NP; + case X86::JO_1: return X86::COND_O; + case X86::JNO_1: return X86::COND_NO; + } +} + +/// Return condition code of a SET opcode. +static X86::CondCode getCondFromSETOpc(unsigned Opc) { + switch (Opc) { + default: return X86::COND_INVALID; + case X86::SETAr: case X86::SETAm: return X86::COND_A; + case X86::SETAEr: case X86::SETAEm: return X86::COND_AE; + case X86::SETBr: case X86::SETBm: return X86::COND_B; + case X86::SETBEr: case X86::SETBEm: return X86::COND_BE; + case X86::SETEr: case X86::SETEm: return X86::COND_E; + case X86::SETGr: case X86::SETGm: return X86::COND_G; + case X86::SETGEr: case X86::SETGEm: return X86::COND_GE; + case X86::SETLr: case X86::SETLm: return X86::COND_L; + case X86::SETLEr: case X86::SETLEm: return X86::COND_LE; + case X86::SETNEr: case X86::SETNEm: return X86::COND_NE; + case X86::SETNOr: case X86::SETNOm: return X86::COND_NO; + case X86::SETNPr: case X86::SETNPm: return X86::COND_NP; + case X86::SETNSr: case X86::SETNSm: return X86::COND_NS; + case X86::SETOr: case X86::SETOm: return X86::COND_O; + case X86::SETPr: case X86::SETPm: return X86::COND_P; + case X86::SETSr: case X86::SETSm: return X86::COND_S; + } +} + +/// Return condition code of a CMov opcode. 
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) { + switch (Opc) { + default: return X86::COND_INVALID; + case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm: + case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr: + return X86::COND_A; + case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm: + case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr: + return X86::COND_AE; + case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm: + case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr: + return X86::COND_B; + case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm: + case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr: + return X86::COND_BE; + case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm: + case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr: + return X86::COND_E; + case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm: + case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr: + return X86::COND_G; + case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm: + case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr: + return X86::COND_GE; + case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm: + case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr: + return X86::COND_L; + case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm: + case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr: + return X86::COND_LE; + case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm: + case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr: + return X86::COND_NE; + case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm: + case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr: + return X86::COND_NO; + case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm: + case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr: + return X86::COND_NP; + case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm: + case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr: + return X86::COND_NS; + case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm: + case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr: + return X86::COND_O; + case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm: + case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr: + return X86::COND_P; + case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm: + case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr: + return X86::COND_S; + } +} + +unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Illegal condition code!"); + case X86::COND_E: return X86::JE_1; + case X86::COND_NE: return X86::JNE_1; + case X86::COND_L: return X86::JL_1; + case X86::COND_LE: return X86::JLE_1; + case X86::COND_G: return X86::JG_1; + case X86::COND_GE: return X86::JGE_1; + case X86::COND_B: return X86::JB_1; + case X86::COND_BE: return X86::JBE_1; + case X86::COND_A: return X86::JA_1; + case X86::COND_AE: return X86::JAE_1; + case X86::COND_S: return X86::JS_1; + case X86::COND_NS: return X86::JNS_1; + case X86::COND_P: return X86::JP_1; + case X86::COND_NP: return X86::JNP_1; + case X86::COND_O: return X86::JO_1; + case X86::COND_NO: return X86::JNO_1; + } +} + +/// Return the inverse of the specified condition, +/// e.g. turning COND_E to COND_NE. 
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Illegal condition code!"); + case X86::COND_E: return X86::COND_NE; + case X86::COND_NE: return X86::COND_E; + case X86::COND_L: return X86::COND_GE; + case X86::COND_LE: return X86::COND_G; + case X86::COND_G: return X86::COND_LE; + case X86::COND_GE: return X86::COND_L; + case X86::COND_B: return X86::COND_AE; + case X86::COND_BE: return X86::COND_A; + case X86::COND_A: return X86::COND_BE; + case X86::COND_AE: return X86::COND_B; + case X86::COND_S: return X86::COND_NS; + case X86::COND_NS: return X86::COND_S; + case X86::COND_P: return X86::COND_NP; + case X86::COND_NP: return X86::COND_P; + case X86::COND_O: return X86::COND_NO; + case X86::COND_NO: return X86::COND_O; + } +} + +/// Assuming the flags are set by MI(a,b), return the condition code if we +/// modify the instructions such that flags are set by MI(b,a). +static X86::CondCode getSwappedCondition(X86::CondCode CC) { + switch (CC) { + default: return X86::COND_INVALID; + case X86::COND_E: return X86::COND_E; + case X86::COND_NE: return X86::COND_NE; + case X86::COND_L: return X86::COND_G; + case X86::COND_LE: return X86::COND_GE; + case X86::COND_G: return X86::COND_L; + case X86::COND_GE: return X86::COND_LE; + case X86::COND_B: return X86::COND_A; + case X86::COND_BE: return X86::COND_AE; + case X86::COND_A: return X86::COND_B; + case X86::COND_AE: return X86::COND_BE; + } +} + +/// Return a set opcode for the given condition and +/// whether it has memory operand. +unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { + static const uint16_t Opc[16][2] = { + { X86::SETAr, X86::SETAm }, + { X86::SETAEr, X86::SETAEm }, + { X86::SETBr, X86::SETBm }, + { X86::SETBEr, X86::SETBEm }, + { X86::SETEr, X86::SETEm }, + { X86::SETGr, X86::SETGm }, + { X86::SETGEr, X86::SETGEm }, + { X86::SETLr, X86::SETLm }, + { X86::SETLEr, X86::SETLEm }, + { X86::SETNEr, X86::SETNEm }, + { X86::SETNOr, X86::SETNOm }, + { X86::SETNPr, X86::SETNPm }, + { X86::SETNSr, X86::SETNSm }, + { X86::SETOr, X86::SETOm }, + { X86::SETPr, X86::SETPm }, + { X86::SETSr, X86::SETSm } + }; + + assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes"); + return Opc[CC][HasMemoryOperand ? 1 : 0]; +} + +/// Return a cmov opcode for the given condition, +/// register size in bytes, and operand type. 
+unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand) { + static const uint16_t Opc[32][3] = { + { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, + { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, + { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr }, + { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr }, + { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr }, + { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr }, + { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr }, + { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr }, + { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr }, + { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr }, + { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr }, + { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr }, + { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr }, + { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr }, + { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr }, + { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr }, + { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm }, + { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm }, + { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm }, + { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm }, + { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm }, + { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm }, + { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm }, + { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm }, + { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm }, + { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm }, + { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm }, + { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm }, + { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm }, + { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm }, + { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm }, + { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm } + }; + + assert(CC < 16 && "Can only handle standard cond codes"); + unsigned Idx = HasMemoryOperand ? 16+CC : CC; + switch(RegBytes) { + default: llvm_unreachable("Illegal register size!"); + case 2: return Opc[Idx][0]; + case 4: return Opc[Idx][1]; + case 8: return Opc[Idx][2]; + } +} + +bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { + if (!MI->isTerminator()) return false; + + // Conditional branch is a special case. + if (MI->isBranch() && !MI->isBarrier()) + return true; + if (!MI->isPredicable()) + return true; + return !isPredicated(MI); +} + +bool X86InstrInfo::AnalyzeBranchImpl( + MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const { + + // Start from the bottom of the block and work up, examining the + // terminator instructions. + MachineBasicBlock::iterator I = MBB.end(); + MachineBasicBlock::iterator UnCondBrIter = MBB.end(); + while (I != MBB.begin()) { + --I; + if (I->isDebugValue()) + continue; + + // Working from the bottom, when we see a non-terminator instruction, we're + // done. + if (!isUnpredicatedTerminator(I)) + break; + + // A terminator that isn't a branch can't easily be handled by this + // analysis. + if (!I->isBranch()) + return true; + + // Handle unconditional branches. + if (I->getOpcode() == X86::JMP_1) { + UnCondBrIter = I; + + if (!AllowModify) { + TBB = I->getOperand(0).getMBB(); + continue; + } + + // If the block has any instructions after a JMP, delete them. 
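+ // (They are dead: JMP_1 is an unconditional branch, and AllowModify was
+ // checked above, so it is safe to erase them here.)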
+ while (std::next(I) != MBB.end()) + std::next(I)->eraseFromParent(); + + Cond.clear(); + FBB = nullptr; + + // Delete the JMP if it's equivalent to a fall-through. + if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { + TBB = nullptr; + I->eraseFromParent(); + I = MBB.end(); + UnCondBrIter = MBB.end(); + continue; + } + + // TBB is used to indicate the unconditional destination. + TBB = I->getOperand(0).getMBB(); + continue; + } + + // Handle conditional branches. + X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode()); + if (BranchCode == X86::COND_INVALID) + return true; // Can't handle indirect branch. + + // Working from the bottom, handle the first conditional branch. + if (Cond.empty()) { + MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); + if (AllowModify && UnCondBrIter != MBB.end() && + MBB.isLayoutSuccessor(TargetBB)) { + // If we can modify the code and it ends in something like: + // + // jCC L1 + // jmp L2 + // L1: + // ... + // L2: + // + // Then we can change this to: + // + // jnCC L2 + // L1: + // ... + // L2: + // + // Which is a bit more efficient. + // We conditionally jump to the fall-through block. + BranchCode = GetOppositeBranchCondition(BranchCode); + unsigned JNCC = GetCondBranchFromCond(BranchCode); + MachineBasicBlock::iterator OldInst = I; + + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) + .addMBB(UnCondBrIter->getOperand(0).getMBB()); + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) + .addMBB(TargetBB); + + OldInst->eraseFromParent(); + UnCondBrIter->eraseFromParent(); + + // Restart the analysis. + UnCondBrIter = MBB.end(); + I = MBB.end(); + continue; + } + + FBB = TBB; + TBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(BranchCode)); + CondBranches.push_back(I); + continue; + } + + // Handle subsequent conditional branches. Only handle the case where all + // conditional branches branch to the same destination and their condition + // opcodes fit one of the special multi-branch idioms. + assert(Cond.size() == 1); + assert(TBB); + + // Only handle the case where all conditional branches branch to the same + // destination. + if (TBB != I->getOperand(0).getMBB()) + return true; + + // If the conditions are the same, we can leave them alone. + X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); + if (OldBranchCode == BranchCode) + continue; + + // If they differ, see if they fit one of the known patterns. Theoretically, + // we could handle more patterns here, but we shouldn't expect to see them + // if instruction selection has done a reasonable job. + if ((OldBranchCode == X86::COND_NP && + BranchCode == X86::COND_E) || + (OldBranchCode == X86::COND_E && + BranchCode == X86::COND_NP)) + BranchCode = X86::COND_NP_OR_E; + else if ((OldBranchCode == X86::COND_P && + BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_NE && + BranchCode == X86::COND_P)) + BranchCode = X86::COND_NE_OR_P; + else + return true; + + // Update the MachineOperand. 
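+ // (Illustrative: such branch pairs typically come from floating-point
+ // comparisons, where the unordered case reported in PF has to be combined
+ // with ZF, so instruction selection emits two conditional branches to the
+ // same target.)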
+ Cond[0].setImm(BranchCode); + CondBranches.push_back(I); + } + + return false; +} + +bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + SmallVector<MachineInstr *, 4> CondBranches; + return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify); +} + +bool X86InstrInfo::AnalyzeBranchPredicate(MachineBasicBlock &MBB, + MachineBranchPredicate &MBP, + bool AllowModify) const { + using namespace std::placeholders; + + SmallVector<MachineOperand, 4> Cond; + SmallVector<MachineInstr *, 4> CondBranches; + if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches, + AllowModify)) + return true; + + if (Cond.size() != 1) + return true; + + assert(MBP.TrueDest && "expected!"); + + if (!MBP.FalseDest) + MBP.FalseDest = MBB.getNextNode(); + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + MachineInstr *ConditionDef = nullptr; + bool SingleUseCondition = true; + + for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) { + if (I->modifiesRegister(X86::EFLAGS, TRI)) { + ConditionDef = &*I; + break; + } + + if (I->readsRegister(X86::EFLAGS, TRI)) + SingleUseCondition = false; + } + + if (!ConditionDef) + return true; + + if (SingleUseCondition) { + for (auto *Succ : MBB.successors()) + if (Succ->isLiveIn(X86::EFLAGS)) + SingleUseCondition = false; + } + + MBP.ConditionDef = ConditionDef; + MBP.SingleUseCondition = SingleUseCondition; + + // Currently we only recognize the simple pattern: + // + // test %reg, %reg + // je %label + // + const unsigned TestOpcode = + Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr; + + if (ConditionDef->getOpcode() == TestOpcode && + ConditionDef->getNumOperands() == 3 && + ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) && + (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) { + MBP.LHS = ConditionDef->getOperand(0); + MBP.RHS = MachineOperand::CreateImm(0); + MBP.Predicate = Cond[0].getImm() == X86::COND_NE + ? MachineBranchPredicate::PRED_NE + : MachineBranchPredicate::PRED_EQ; + return false; + } + + return true; +} + +unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + unsigned Count = 0; + + while (I != MBB.begin()) { + --I; + if (I->isDebugValue()) + continue; + if (I->getOpcode() != X86::JMP_1 && + getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + break; + // Remove the branch. + I->eraseFromParent(); + I = MBB.end(); + ++Count; + } + + return Count; +} + +unsigned +X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 1 || Cond.size() == 0) && + "X86 branch conditions have one component!"); + + if (Cond.empty()) { + // Unconditional branch? + assert(!FBB && "Unconditional branch with multiple successors!"); + BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB); + return 1; + } + + // Conditional branch. + unsigned Count = 0; + X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); + switch (CC) { + case X86::COND_NP_OR_E: + // Synthesize NP_OR_E with two branches. 
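+ // Emitted sequence (both branches share the target): "jnp TBB; je TBB",
+ // so the branch is taken if either condition holds.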
+ BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB); + ++Count; + break; + case X86::COND_NE_OR_P: + // Synthesize NE_OR_P with two branches. + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); + ++Count; + break; + default: { + unsigned Opc = GetCondBranchFromCond(CC); + BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); + ++Count; + } + } + if (FBB) { + // Two-way Conditional branch. Insert the second branch. + BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); + ++Count; + } + return Count; +} + +bool X86InstrInfo:: +canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, int &TrueCycles, int &FalseCycles) const { + // Not all subtargets have cmov instructions. + if (!Subtarget.hasCMov()) + return false; + if (Cond.size() != 1) + return false; + // We cannot do the composite conditions, at least not in SSA form. + if ((X86::CondCode)Cond[0].getImm() > X86::COND_S) + return false; + + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) + return false; + + // We have cmov instructions for 16, 32, and 64 bit general purpose registers. + if (X86::GR16RegClass.hasSubClassEq(RC) || + X86::GR32RegClass.hasSubClassEq(RC) || + X86::GR64RegClass.hasSubClassEq(RC)) { + // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy + // Bridge. Probably Ivy Bridge as well. + CondCycles = 2; + TrueCycles = 2; + FalseCycles = 2; + return true; + } + + // Can't do vectors. + return false; +} + +void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + assert(Cond.size() == 1 && "Invalid Cond array"); + unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), + MRI.getRegClass(DstReg)->getSize(), + false/*HasMemoryOperand*/); + BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); +} + +/// Test if the given register is a physical h register. +static bool isHReg(unsigned Reg) { + return X86::GR8_ABCD_HRegClass.contains(Reg); +} + +// Try and copy between VR128/VR64 and GR64 registers. +static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, + const X86Subtarget &Subtarget) { + + // SrcReg(VR128) -> DestReg(GR64) + // SrcReg(VR64) -> DestReg(GR64) + // SrcReg(GR64) -> DestReg(VR128) + // SrcReg(GR64) -> DestReg(VR64) + + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + if (X86::GR64RegClass.contains(DestReg)) { + if (X86::VR128XRegClass.contains(SrcReg)) + // Copy from a VR128 register to a GR64 register. + return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr : + X86::MOVPQIto64rr); + if (X86::VR64RegClass.contains(SrcReg)) + // Copy from a VR64 register to a GR64 register. + return X86::MMX_MOVD64from64rr; + } else if (X86::GR64RegClass.contains(SrcReg)) { + // Copy from a GR64 register to a VR128 register. + if (X86::VR128XRegClass.contains(DestReg)) + return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr : + X86::MOV64toPQIrr); + // Copy from a GR64 register to a VR64 register. 
+ if (X86::VR64RegClass.contains(DestReg)) + return X86::MMX_MOVD64to64rr; + } + + // SrcReg(FR32) -> DestReg(GR32) + // SrcReg(GR32) -> DestReg(FR32) + + if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg)) + // Copy from a FR32 register to a GR32 register. + return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr); + + if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + // Copy from a GR32 register to a FR32 register. + return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr); + return 0; +} + +static bool MaskRegClassContains(unsigned Reg) { + return X86::VK8RegClass.contains(Reg) || + X86::VK16RegClass.contains(Reg) || + X86::VK32RegClass.contains(Reg) || + X86::VK64RegClass.contains(Reg) || + X86::VK1RegClass.contains(Reg); +} + +static bool GRRegClassContains(unsigned Reg) { + return X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg) || + X86::GR16RegClass.contains(Reg) || + X86::GR8RegClass.contains(Reg); +} +static +unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVBrk; + } + if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVBkr; + } + return 0; +} + +static +unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg)) + return X86::KMOVQkk; + if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg)) + return X86::KMOVDrk; + if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg)) + return X86::KMOVQrk; + if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + return X86::KMOVDkr; + if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg)) + return X86::KMOVQkr; + return 0; +} + +static +unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, + const X86Subtarget &Subtarget) +{ + if (Subtarget.hasDQI()) + if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg)) + return Opc; + if (Subtarget.hasBWI()) + if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) + return Opc; + if (X86::VR128XRegClass.contains(DestReg, SrcReg) || + X86::VR256XRegClass.contains(DestReg, SrcReg) || + X86::VR512RegClass.contains(DestReg, SrcReg)) { + DestReg = get512BitSuperRegister(DestReg); + SrcReg = get512BitSuperRegister(SrcReg); + return X86::VMOVAPSZrr; + } + if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) + return X86::KMOVWkk; + if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVWkr; + } + if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVWrk; + } + return 0; +} + +void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // First deal with the normal symmetric copies. 
+ bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + unsigned Opc = 0; + if (X86::GR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV64rr; + else if (X86::GR32RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV32rr; + else if (X86::GR16RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV16rr; + else if (X86::GR8RegClass.contains(DestReg, SrcReg)) { + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if ((isHReg(DestReg) || isHReg(SrcReg)) && + Subtarget.is64Bit()) { + Opc = X86::MOV8rr_NOREX; + // Both operands must be encodable without an REX prefix. + assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) && + "8-bit H register can not be copied outside GR8_NOREX"); + } else + Opc = X86::MOV8rr; + } + else if (X86::VR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MMX_MOVQ64rr; + else if (HasAVX512) + Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget); + else if (X86::VR128RegClass.contains(DestReg, SrcReg)) + Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; + else if (X86::VR256RegClass.contains(DestReg, SrcReg)) + Opc = X86::VMOVAPSYrr; + if (!Opc) + Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); + + if (Opc) { + BuildMI(MBB, MI, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + bool FromEFLAGS = SrcReg == X86::EFLAGS; + bool ToEFLAGS = DestReg == X86::EFLAGS; + int Reg = FromEFLAGS ? DestReg : SrcReg; + bool is32 = X86::GR32RegClass.contains(Reg); + bool is64 = X86::GR64RegClass.contains(Reg); + + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(Subtarget.is64Bit() && + "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } + return; + } + + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is + // inefficient. Instead: + // - Save the overflow flag OF into AL using SETO, and restore it using a + // signed 8-bit addition of AL and INT8_MAX. + // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH + // using LAHF/SAHF. + // - When RAX/EAX is live and isn't the destination register, make sure it + // isn't clobbered by PUSH/POP'ing it before and after saving/restoring + // the flags. + // This approach is ~2.25x faster than using PUSHF/POPF. + // + // This is still somewhat inefficient because we don't know which flags are + // actually live inside EFLAGS. Were we able to do a single SETcc instead of + // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. + // + // PUSHF/POPF is also potentially incorrect because it affects other flags + // such as TF/IF/DF, which LLVM doesn't model. + // + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. 
+ // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. + + + bool AXDead = (Reg == AX) || + (MachineBasicBlock::LQR_Dead == + MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + if (!AXDead) { + // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may + // actually be dead. This is not a problem for correctness as we are just + // (unnecessarily) saving+restoring a dead register. However the + // MachineVerifier expects operands that read from dead registers + // to be marked with the "undef" flag. + // An example of this can be found in + // test/CodeGen/X86/peephole-na-phys-copy-folding.ll and + // test/CodeGen/X86/cmpxchg-clobber-flags.ll when using + // -verify-machineinstrs. + BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); + } + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); + BuildMI(MBB, MI, DL, get(X86::LAHF)); + BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) + .addReg(X86::AL) + .addImm(INT8_MAX); + BuildMI(MBB, MI, DL, get(X86::SAHF)); + } + if (!AXDead) + BuildMI(MBB, MI, DL, get(Pop), AX); + return; + } + + DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) + << " to " << RI.getName(DestReg) << '\n'); + llvm_unreachable("Cannot emit physreg copy instruction"); +} + +static unsigned getLoadStoreRegOpcode(unsigned Reg, + const TargetRegisterClass *RC, + bool isStackAligned, + const X86Subtarget &STI, + bool load) { + if (STI.hasAVX512()) { + if (X86::VK8RegClass.hasSubClassEq(RC) || + X86::VK16RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVWkm : X86::KMOVWmk; + if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC)) + return load ? X86::VMOVSSZrm : X86::VMOVSSZmr; + if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC)) + return load ? X86::VMOVSDZrm : X86::VMOVSDZmr; + if (X86::VR512RegClass.hasSubClassEq(RC)) + return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; + } + + bool HasAVX = STI.hasAVX(); + switch (RC->getSize()) { + default: + llvm_unreachable("Unknown spill size"); + case 1: + assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); + if (STI.is64Bit()) + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) + return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; + return load ? X86::MOV8rm : X86::MOV8mr; + case 2: + assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); + return load ? X86::MOV16rm : X86::MOV16mr; + case 4: + if (X86::GR32RegClass.hasSubClassEq(RC)) + return load ? X86::MOV32rm : X86::MOV32mr; + if (X86::FR32RegClass.hasSubClassEq(RC)) + return load ? + (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : + (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + if (X86::RFP32RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp32m : X86::ST_Fp32m; + llvm_unreachable("Unknown 4-byte regclass"); + case 8: + if (X86::GR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOV64rm : X86::MOV64mr; + if (X86::FR64RegClass.hasSubClassEq(RC)) + return load ? + (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : + (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + if (X86::VR64RegClass.hasSubClassEq(RC)) + return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; + if (X86::RFP64RegClass.hasSubClassEq(RC)) + return load ? 
X86::LD_Fp64m : X86::ST_Fp64m; + llvm_unreachable("Unknown 8-byte regclass"); + case 10: + assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); + return load ? X86::LD_Fp80m : X86::ST_FpP80m; + case 16: { + assert((X86::VR128RegClass.hasSubClassEq(RC) || + X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass"); + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? + (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) : + (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); + else + return load ? + (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) : + (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); + } + case 32: + assert((X86::VR256RegClass.hasSubClassEq(RC) || + X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass"); + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; + else + return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; + case 64: + assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); + if (isStackAligned) + return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; + else + return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; + } +} + +bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + const MCInstrDesc &Desc = MemOp->getDesc(); + int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags, MemOp->getOpcode()); + if (MemRefBegin < 0) + return false; + + MemRefBegin += X86II::getOperandBias(Desc); + + BaseReg = MemOp->getOperand(MemRefBegin + X86::AddrBaseReg).getReg(); + if (MemOp->getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1) + return false; + + if (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() != + X86::NoRegister) + return false; + + const MachineOperand &DispMO = MemOp->getOperand(MemRefBegin + X86::AddrDisp); + + // Displacement can be symbolic + if (!DispMO.isImm()) + return false; + + Offset = DispMO.getImm(); + + return (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() == + X86::NoRegister); +} + +static unsigned getStoreRegOpcode(unsigned SrcReg, + const TargetRegisterClass *RC, + bool isStackAligned, + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false); +} + + +static unsigned getLoadRegOpcode(unsigned DestReg, + const TargetRegisterClass *RC, + bool isStackAligned, + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true); +} + +void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && + "Stack slot too small for store"); + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + bool isAligned = + (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); + DebugLoc DL = MBB.findDebugLoc(MI); + addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); +} + +void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, + bool isKill, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + 
SmallVectorImpl<MachineInstr*> &NewMIs) const { + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + bool isAligned = MMOBegin != MMOEnd && + (*MMOBegin)->getAlignment() >= Alignment; + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); + for (unsigned i = 0, e = Addr.size(); i != e; ++i) + MIB.addOperand(Addr[i]); + MIB.addReg(SrcReg, getKillRegState(isKill)); + (*MIB).setMemRefs(MMOBegin, MMOEnd); + NewMIs.push_back(MIB); +} + + +void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + bool isAligned = + (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); + DebugLoc DL = MBB.findDebugLoc(MI); + addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); +} + +void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + bool isAligned = MMOBegin != MMOEnd && + (*MMOBegin)->getAlignment() >= Alignment; + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); + for (unsigned i = 0, e = Addr.size(); i != e; ++i) + MIB.addOperand(Addr[i]); + (*MIB).setMemRefs(MMOBegin, MMOEnd); + NewMIs.push_back(MIB); +} + +bool X86InstrInfo:: +analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, + int &CmpMask, int &CmpValue) const { + switch (MI->getOpcode()) { + default: break; + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP8ri: + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(1).getImm(); + return true; + // A SUB can be used to perform comparison. + case X86::SUB64rm: + case X86::SUB32rm: + case X86::SUB16rm: + case X86::SUB8rm: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = 0; + return true; + case X86::SUB64rr: + case X86::SUB32rr: + case X86::SUB16rr: + case X86::SUB8rr: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = MI->getOperand(2).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB8ri: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(2).getImm(); + return true; + case X86::CMP64rr: + case X86::CMP32rr: + case X86::CMP16rr: + case X86::CMP8rr: + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = MI->getOperand(1).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + SrcReg = MI->getOperand(0).getReg(); + if (MI->getOperand(1).getReg() != SrcReg) return false; + // Compare against zero. 
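+ // ("test %reg, %reg" ANDs the register with itself, so ZF and SF describe a
+ // comparison of the register against zero; report it that way.)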
+ SrcReg2 = 0; + CmpMask = ~0; + CmpValue = 0; + return true; + } + return false; +} + +/// Check whether the first instruction, whose only +/// purpose is to update flags, can be made redundant. +/// CMPrr can be made redundant by SUBrr if the operands are the same. +/// This function can be extended later on. +/// SrcReg, SrcRegs: register operands for FlagI. +/// ImmValue: immediate for FlagI if it takes an immediate. +inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, + unsigned SrcReg2, int ImmValue, + MachineInstr *OI) { + if (((FlagI->getOpcode() == X86::CMP64rr && + OI->getOpcode() == X86::SUB64rr) || + (FlagI->getOpcode() == X86::CMP32rr && + OI->getOpcode() == X86::SUB32rr)|| + (FlagI->getOpcode() == X86::CMP16rr && + OI->getOpcode() == X86::SUB16rr)|| + (FlagI->getOpcode() == X86::CMP8rr && + OI->getOpcode() == X86::SUB8rr)) && + ((OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getReg() == SrcReg2) || + (OI->getOperand(1).getReg() == SrcReg2 && + OI->getOperand(2).getReg() == SrcReg))) + return true; + + if (((FlagI->getOpcode() == X86::CMP64ri32 && + OI->getOpcode() == X86::SUB64ri32) || + (FlagI->getOpcode() == X86::CMP64ri8 && + OI->getOpcode() == X86::SUB64ri8) || + (FlagI->getOpcode() == X86::CMP32ri && + OI->getOpcode() == X86::SUB32ri) || + (FlagI->getOpcode() == X86::CMP32ri8 && + OI->getOpcode() == X86::SUB32ri8) || + (FlagI->getOpcode() == X86::CMP16ri && + OI->getOpcode() == X86::SUB16ri) || + (FlagI->getOpcode() == X86::CMP16ri8 && + OI->getOpcode() == X86::SUB16ri8) || + (FlagI->getOpcode() == X86::CMP8ri && + OI->getOpcode() == X86::SUB8ri)) && + OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getImm() == ImmValue) + return true; + return false; +} + +/// Check whether the definition can be converted +/// to remove a comparison against zero. +inline static bool isDefConvertible(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return false; + + // The shift instructions only modify ZF if their shift count is non-zero. + // N.B.: The processor truncates the shift count depending on the encoding. + case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri: + case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri: + return getTruncatedShiftCount(MI, 2) != 0; + + // Some left shift instructions can be turned into LEA instructions but only + // if their flags aren't used. Avoid transforming such instructions. 
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{ + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (isTruncatedShiftCountForLEA(ShAmt)) return false; + return ShAmt != 0; + } + + case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8: + case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8: + return getTruncatedShiftCount(MI, 3) != 0; + + case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: + case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: + case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: + case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: + case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: + case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: + case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: + case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: + case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: + case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: + case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: + case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: + case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: + case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: + case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: + case X86::AND16rr: case X86::AND8rr: case X86::AND64rm: + case X86::AND32rm: case X86::AND16rm: case X86::AND8rm: + case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri: + case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8: + case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr: + case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm: + case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm: + case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri: + case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8: + case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: + case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: + case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r: + case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1: + case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1: + case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1: + case X86::ADC32ri: case X86::ADC32ri8: + case X86::ADC32rr: case X86::ADC64ri32: + case X86::ADC64ri8: case X86::ADC64rr: + case X86::SBB32ri: case X86::SBB32ri8: + case X86::SBB32rr: case X86::SBB64ri32: + case X86::SBB64ri8: case X86::SBB64rr: + case X86::ANDN32rr: case X86::ANDN32rm: + case X86::ANDN64rr: case X86::ANDN64rm: + case X86::BEXTR32rr: case X86::BEXTR64rr: + case X86::BEXTR32rm: case X86::BEXTR64rm: + case X86::BLSI32rr: case X86::BLSI32rm: + case X86::BLSI64rr: case X86::BLSI64rm: + case X86::BLSMSK32rr:case X86::BLSMSK32rm: + case X86::BLSMSK64rr:case X86::BLSMSK64rm: + case X86::BLSR32rr: case X86::BLSR32rm: + case X86::BLSR64rr: case X86::BLSR64rm: + case X86::BZHI32rr: case X86::BZHI32rm: + case X86::BZHI64rr: case X86::BZHI64rm: + case X86::LZCNT16rr: case X86::LZCNT16rm: + case X86::LZCNT32rr: case X86::LZCNT32rm: + case X86::LZCNT64rr: case X86::LZCNT64rm: + case X86::POPCNT16rr:case X86::POPCNT16rm: + case X86::POPCNT32rr:case X86::POPCNT32rm: + case X86::POPCNT64rr:case X86::POPCNT64rm: + case X86::TZCNT16rr: case X86::TZCNT16rm: + case X86::TZCNT32rr: case X86::TZCNT32rm: + case X86::TZCNT64rr: case X86::TZCNT64rm: + return true; + } +} + +/// Check whether the use can be converted to remove a 
comparison against zero. +static X86::CondCode isUseDefConvertible(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return X86::COND_INVALID; + case X86::LZCNT16rr: case X86::LZCNT16rm: + case X86::LZCNT32rr: case X86::LZCNT32rm: + case X86::LZCNT64rr: case X86::LZCNT64rm: + return X86::COND_B; + case X86::POPCNT16rr:case X86::POPCNT16rm: + case X86::POPCNT32rr:case X86::POPCNT32rm: + case X86::POPCNT64rr:case X86::POPCNT64rm: + return X86::COND_E; + case X86::TZCNT16rr: case X86::TZCNT16rm: + case X86::TZCNT32rr: case X86::TZCNT32rm: + case X86::TZCNT64rr: case X86::TZCNT64rm: + return X86::COND_B; + } +} + +/// Check if there exists an earlier instruction that +/// operates on the same source operands and sets flags in the same way as +/// Compare; remove Compare if possible. +bool X86InstrInfo:: +optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, + int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const { + // Check whether we can replace SUB with CMP. + unsigned NewOpcode = 0; + switch (CmpInstr->getOpcode()) { + default: break; + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB8ri: + case X86::SUB64rm: + case X86::SUB32rm: + case X86::SUB16rm: + case X86::SUB8rm: + case X86::SUB64rr: + case X86::SUB32rr: + case X86::SUB16rr: + case X86::SUB8rr: { + if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) + return false; + // There is no use of the destination register, we can replace SUB with CMP. + switch (CmpInstr->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::SUB64rm: NewOpcode = X86::CMP64rm; break; + case X86::SUB32rm: NewOpcode = X86::CMP32rm; break; + case X86::SUB16rm: NewOpcode = X86::CMP16rm; break; + case X86::SUB8rm: NewOpcode = X86::CMP8rm; break; + case X86::SUB64rr: NewOpcode = X86::CMP64rr; break; + case X86::SUB32rr: NewOpcode = X86::CMP32rr; break; + case X86::SUB16rr: NewOpcode = X86::CMP16rr; break; + case X86::SUB8rr: NewOpcode = X86::CMP8rr; break; + case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break; + case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break; + case X86::SUB32ri: NewOpcode = X86::CMP32ri; break; + case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break; + case X86::SUB16ri: NewOpcode = X86::CMP16ri; break; + case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break; + case X86::SUB8ri: NewOpcode = X86::CMP8ri; break; + } + CmpInstr->setDesc(get(NewOpcode)); + CmpInstr->RemoveOperand(0); + // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. + if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm || + NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm) + return false; + } + } + + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) return false; + + // CmpInstr is the first instruction of the BB. + MachineBasicBlock::iterator I = CmpInstr, Def = MI; + + // If we are comparing against zero, check whether we can use MI to update + // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. + bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0); + if (IsCmpZero && MI->getParent() != CmpInstr->getParent()) + return false; + + // If we have a use of the source register between the def and our compare + // instruction we can eliminate the compare iff the use sets EFLAGS in the + // right way. 
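+  // A sketch of the intended pattern (illustrative registers): if %reg is
+  // defined by a plain copy, later read by "popcnt %reg, %tmp", and finally
+  // tested with "test %reg, %reg; je ...", the popcnt already sets ZF exactly
+  // when %reg is zero, so the test can be dropped and the branch keyed off
+  // the popcnt's flags (see isUseDefConvertible above).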
+ bool ShouldUpdateCC = false; + X86::CondCode NewCC = X86::COND_INVALID; + if (IsCmpZero && !isDefConvertible(MI)) { + // Scan forward from the use until we hit the use we're looking for or the + // compare instruction. + for (MachineBasicBlock::iterator J = MI;; ++J) { + // Do we have a convertible instruction? + NewCC = isUseDefConvertible(J); + if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() && + J->getOperand(1).getReg() == SrcReg) { + assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!"); + ShouldUpdateCC = true; // Update CC later on. + // This is not a def of SrcReg, but still a def of EFLAGS. Keep going + // with the new def. + MI = Def = J; + break; + } + + if (J == I) + return false; + } + } + + // We are searching for an earlier instruction that can make CmpInstr + // redundant and that instruction will be saved in Sub. + MachineInstr *Sub = nullptr; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + // We iterate backward, starting from the instruction before CmpInstr and + // stop when reaching the definition of a source register or done with the BB. + // RI points to the instruction before CmpInstr. + // If the definition is in this basic block, RE points to the definition; + // otherwise, RE is the rend of the basic block. + MachineBasicBlock::reverse_iterator + RI = MachineBasicBlock::reverse_iterator(I), + RE = CmpInstr->getParent() == MI->getParent() ? + MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ : + CmpInstr->getParent()->rend(); + MachineInstr *Movr0Inst = nullptr; + for (; RI != RE; ++RI) { + MachineInstr *Instr = &*RI; + // Check whether CmpInstr can be made redundant by the current instruction. + if (!IsCmpZero && + isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) { + Sub = Instr; + break; + } + + if (Instr->modifiesRegister(X86::EFLAGS, TRI) || + Instr->readsRegister(X86::EFLAGS, TRI)) { + // This instruction modifies or uses EFLAGS. + + // MOV32r0 etc. are implemented with xor which clobbers condition code. + // They are safe to move up, if the definition to EFLAGS is dead and + // earlier instructions do not read or write EFLAGS. + if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 && + Instr->registerDefIsDead(X86::EFLAGS, TRI)) { + Movr0Inst = Instr; + continue; + } + + // We can't remove CmpInstr. + return false; + } + } + + // Return false if no candidates exist. + if (!IsCmpZero && !Sub) + return false; + + bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg); + + // Scan forward from the instruction after CmpInstr for uses of EFLAGS. + // It is safe to remove CmpInstr if EFLAGS is redefined or killed. + // If we are done with the basic block, we need to check whether EFLAGS is + // live-out. + bool IsSafe = false; + SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate; + MachineBasicBlock::iterator E = CmpInstr->getParent()->end(); + for (++I; I != E; ++I) { + const MachineInstr &Instr = *I; + bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI); + bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI); + // We should check the usage if this instruction uses and updates EFLAGS. + if (!UseEFLAGS && ModifyEFLAGS) { + // It is safe to remove CmpInstr if EFLAGS is updated again. + IsSafe = true; + break; + } + if (!UseEFLAGS && !ModifyEFLAGS) + continue; + + // EFLAGS is used by this instruction. 
+ X86::CondCode OldCC = X86::COND_INVALID; + bool OpcIsSET = false; + if (IsCmpZero || IsSwapped) { + // We decode the condition code from opcode. + if (Instr.isBranch()) + OldCC = getCondFromBranchOpc(Instr.getOpcode()); + else { + OldCC = getCondFromSETOpc(Instr.getOpcode()); + if (OldCC != X86::COND_INVALID) + OpcIsSET = true; + else + OldCC = X86::getCondFromCMovOpc(Instr.getOpcode()); + } + if (OldCC == X86::COND_INVALID) return false; + } + if (IsCmpZero) { + switch (OldCC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + case X86::COND_O: case X86::COND_NO: + // CF and OF are used, we can't perform this optimization. + return false; + } + + // If we're updating the condition code check if we have to reverse the + // condition. + if (ShouldUpdateCC) + switch (OldCC) { + default: + return false; + case X86::COND_E: + break; + case X86::COND_NE: + NewCC = GetOppositeBranchCondition(NewCC); + break; + } + } else if (IsSwapped) { + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs + // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + // We swap the condition code and synthesize the new opcode. + NewCC = getSwappedCondition(OldCC); + if (NewCC == X86::COND_INVALID) return false; + } + + if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) { + // Synthesize the new opcode. + bool HasMemoryOperand = Instr.hasOneMemOperand(); + unsigned NewOpc; + if (Instr.isBranch()) + NewOpc = GetCondBranchFromCond(NewCC); + else if(OpcIsSET) + NewOpc = getSETFromCond(NewCC, HasMemoryOperand); + else { + unsigned DstReg = Instr.getOperand(0).getReg(); + NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(), + HasMemoryOperand); + } + + // Push the MachineInstr to OpsToUpdate. + // If it is safe to remove CmpInstr, the condition code of these + // instructions will be modified. + OpsToUpdate.push_back(std::make_pair(&*I, NewOpc)); + } + if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) { + // It is safe to remove CmpInstr if EFLAGS is updated again or killed. + IsSafe = true; + break; + } + } + + // If EFLAGS is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if ((IsCmpZero || IsSwapped) && !IsSafe) { + MachineBasicBlock *MBB = CmpInstr->getParent(); + for (MachineBasicBlock *Successor : MBB->successors()) + if (Successor->isLiveIn(X86::EFLAGS)) + return false; + } + + // The instruction to be updated is either Sub or MI. + Sub = IsCmpZero ? MI : Sub; + // Move Movr0Inst to the appropriate place before Sub. + if (Movr0Inst) { + // Look backwards until we find a def that doesn't use the current EFLAGS. + Def = Sub; + MachineBasicBlock::reverse_iterator + InsertI = MachineBasicBlock::reverse_iterator(++Def), + InsertE = Sub->getParent()->rend(); + for (; InsertI != InsertE; ++InsertI) { + MachineInstr *Instr = &*InsertI; + if (!Instr->readsRegister(X86::EFLAGS, TRI) && + Instr->modifiesRegister(X86::EFLAGS, TRI)) { + Sub->getParent()->remove(Movr0Inst); + Instr->getParent()->insert(MachineBasicBlock::iterator(Instr), + Movr0Inst); + break; + } + } + if (InsertI == InsertE) + return false; + } + + // Make sure Sub instruction defines EFLAGS and mark the def live. 
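+  // (The EFLAGS def on that instruction may still be marked dead because
+  //  nothing used its flags before; once CmpInstr is erased, the former users
+  //  of CmpInstr's flags depend on it, so the loop below clears the dead
+  //  marker.)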
+ unsigned i = 0, e = Sub->getNumOperands(); + for (; i != e; ++i) { + MachineOperand &MO = Sub->getOperand(i); + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + MO.setIsDead(false); + break; + } + } + assert(i != e && "Unable to locate a def EFLAGS operand"); + + CmpInstr->eraseFromParent(); + + // Modify the condition code of instructions in OpsToUpdate. + for (auto &Op : OpsToUpdate) + Op.first->setDesc(get(Op.second)); + return true; +} + +/// Try to remove the load by folding it to a register +/// operand at the use. We fold the load instructions if load defines a virtual +/// register, the virtual register is used once in the same BB, and the +/// instructions in-between do not load or store, and have no side effects. +MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &FoldAsLoadDefReg, + MachineInstr *&DefMI) const { + if (FoldAsLoadDefReg == 0) + return nullptr; + // To be conservative, if there exists another load, clear the load candidate. + if (MI->mayLoad()) { + FoldAsLoadDefReg = 0; + return nullptr; + } + + // Check whether we can move DefMI here. + DefMI = MRI->getVRegDef(FoldAsLoadDefReg); + assert(DefMI); + bool SawStore = false; + if (!DefMI->isSafeToMove(nullptr, SawStore)) + return nullptr; + + // Collect information about virtual register operands of MI. + unsigned SrcOperandId = 0; + bool FoundSrcOperand = false; + for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg != FoldAsLoadDefReg) + continue; + // Do not fold if we have a subreg use or a def or multiple uses. + if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) + return nullptr; + + SrcOperandId = i; + FoundSrcOperand = true; + } + if (!FoundSrcOperand) + return nullptr; + + // Check whether we can fold the def into SrcOperandId. + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) { + FoldAsLoadDefReg = 0; + return FoldMI; + } + + return nullptr; +} + +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two undef reads of the register being defined. +/// This is used for mapping: +/// %xmm4 = V_SET0 +/// to: +/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef> +/// +static bool Expand2AddrUndef(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + unsigned Reg = MIB->getOperand(0).getReg(); + MIB->setDesc(Desc); + + // MachineInstr::addOperand() will insert explicit operands before any + // implicit operands. + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + // But we don't trust that. + assert(MIB->getOperand(1).getReg() == Reg && + MIB->getOperand(2).getReg() == Reg && "Misplaced operand"); + return true; +} + +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two %k0 reads. 
+/// This is used for mapping: +/// %k4 = K_SET1 +/// to: +/// %k4 = KXNORrr %k0, %k0 +static bool Expand2AddrKreg(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc, unsigned Reg) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + MIB->setDesc(Desc); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + return true; +} + +static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, + bool MinusOne) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + + // Insert the XOR. + BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + + // Turn the pseudo into an INC or DEC. + MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r)); + MIB.addReg(Reg); + + return true; +} + +// LoadStackGuard has so far only been implemented for 64-bit MachO. Different +// code sequence is needed for other targets. +static void expandLoadStackGuard(MachineInstrBuilder &MIB, + const TargetInstrInfo &TII) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + const GlobalValue *GV = + cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); + unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8); + MachineBasicBlock::iterator I = MIB.getInstr(); + + BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) + .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0) + .addMemOperand(MMO); + MIB->setDebugLoc(DL); + MIB->setDesc(TII.get(X86::MOV64rm)); + MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); +} + +bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + bool HasAVX = Subtarget.hasAVX(); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + switch (MI->getOpcode()) { + case X86::MOV32r0: + return Expand2AddrUndef(MIB, get(X86::XOR32rr)); + case X86::MOV32r1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); + case X86::MOV32r_1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); + case X86::SETB_C8r: + return Expand2AddrUndef(MIB, get(X86::SBB8rr)); + case X86::SETB_C16r: + return Expand2AddrUndef(MIB, get(X86::SBB16rr)); + case X86::SETB_C32r: + return Expand2AddrUndef(MIB, get(X86::SBB32rr)); + case X86::SETB_C64r: + return Expand2AddrUndef(MIB, get(X86::SBB64rr)); + case X86::V_SET0: + case X86::FsFLD0SS: + case X86::FsFLD0SD: + return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); + case X86::AVX_SET0: + assert(HasAVX && "AVX not supported"); + return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); + case X86::AVX512_512_SET0: + return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); + case X86::V_SETALLONES: + return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); + case X86::AVX2_SETALLONES: + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + case X86::TEST8ri_NOREX: + MI->setDesc(get(X86::TEST8ri)); + return true; + case X86::MOV32ri64: + MI->setDesc(get(X86::MOV32ri)); + return true; + + // KNL does not recognize dependency-breaking idioms for mask registers, + // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. 
+ // Using %k0 as the undef input register is a performance heuristic based + // on the assumption that %k0 is used less frequently than the other mask + // registers, since it is not usable as a write mask. + // FIXME: A more advanced approach would be to choose the best input mask + // register based on context. + case X86::KSET0B: + case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); + case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); + case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); + case X86::KSET1B: + case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); + case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); + case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); + case TargetOpcode::LOAD_STACK_GUARD: + expandLoadStackGuard(MIB, *this); + return true; + } + return false; +} + +static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, + int PtrOffset = 0) { + unsigned NumAddrOps = MOs.size(); + + if (NumAddrOps < 4) { + // FrameIndex only - add an immediate offset (whether its zero or not). + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + addOffset(MIB, PtrOffset); + } else { + // General Memory Addressing - we need to add any offset to an existing + // offset. + assert(MOs.size() == 5 && "Unexpected memory operand list length"); + for (unsigned i = 0; i != NumAddrOps; ++i) { + const MachineOperand &MO = MOs[i]; + if (i == 3 && PtrOffset != 0) { + MIB.addDisp(MO, PtrOffset); + } else { + MIB.addOperand(MO); + } + } + } +} + +static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + MachineInstr *MI, + const TargetInstrInfo &TII) { + // Create the base instruction with the memory operand as the first part. + // Omit the implicit operands, something BuildMI can't do. + MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), + MI->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, NewMI); + addOperands(MIB, MOs); + + // Loop over the rest of the ri operands, converting them over. + unsigned NumOps = MI->getDesc().getNumOperands()-2; + for (unsigned i = 0; i != NumOps; ++i) { + MachineOperand &MO = MI->getOperand(i+2); + MIB.addOperand(MO); + } + for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + MIB.addOperand(MO); + } + + MachineBasicBlock *MBB = InsertPt->getParent(); + MBB->insert(InsertPt, NewMI); + + return MIB; +} + +static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, + unsigned OpNo, ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + MachineInstr *MI, const TargetInstrInfo &TII, + int PtrOffset = 0) { + // Omit the implicit operands, something BuildMI can't do. 
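+  // (The trailing 'true' argument to CreateMachineInstr below is what asks
+  //  for the form without implicit operands.)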
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), + MI->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, NewMI); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (i == OpNo) { + assert(MO.isReg() && "Expected to fold into reg operand!"); + addOperands(MIB, MOs, PtrOffset); + } else { + MIB.addOperand(MO); + } + } + + MachineBasicBlock *MBB = InsertPt->getParent(); + MBB->insert(InsertPt, NewMI); + + return MIB; +} + +static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + MachineInstr *MI) { + MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt, + MI->getDebugLoc(), TII.get(Opcode)); + addOperands(MIB, MOs); + return MIB.addImm(0); +} + +MachineInstr *X86InstrInfo::foldMemoryOperandCustom( + MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const { + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + // Attempt to convert the load of inserted vector into a fold load + // of a single float. + if (OpNum == 2) { + unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size <= RCSize && 4 <= Align) { + int PtrOffset = SrcIdx * 4; + unsigned NewImm = (DstIdx << 4) | ZMask; + unsigned NewOpCode = + (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm + : X86::INSERTPSrm); + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); + NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); + return NewMI; + } + } + break; + }; + + return nullptr; +} + +MachineInstr *X86InstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align, bool AllowCommute) const { + const DenseMap<unsigned, + std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; + bool isCallRegIndirect = Subtarget.callRegIndirect(); + bool isTwoAddrFold = false; + + // For CPUs that favor the register form of a call or push, + // do not fold loads into calls or pushes, unless optimizing for size + // aggressively. + if (isCallRegIndirect && !MF.getFunction()->optForMinSize() && + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r || + MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r || + MI->getOpcode() == X86::PUSH64r)) + return nullptr; + + unsigned NumOps = MI->getDesc().getNumOperands(); + bool isTwoAddr = NumOps > 1 && + MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOpcode() == X86::ADD32ri && + MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return nullptr; + + MachineInstr *NewMI = nullptr; + + // Attempt to fold any custom cases we have. + if (MachineInstr *CustomMI = + foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + return CustomMI; + + // Folding a memory location into the two-address part of a two-address + // instruction is different than folding it other places. 
It requires + // replacing the *two* registers with the memory location. + if (isTwoAddr && NumOps >= 2 && OpNum < 2 && + MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && + MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { + OpcodeTablePtr = &RegOp2MemOpTable2Addr; + isTwoAddrFold = true; + } else if (OpNum == 0) { + if (MI->getOpcode() == X86::MOV32r0) { + NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI); + if (NewMI) + return NewMI; + } + + OpcodeTablePtr = &RegOp2MemOpTable0; + } else if (OpNum == 1) { + OpcodeTablePtr = &RegOp2MemOpTable1; + } else if (OpNum == 2) { + OpcodeTablePtr = &RegOp2MemOpTable2; + } else if (OpNum == 3) { + OpcodeTablePtr = &RegOp2MemOpTable3; + } else if (OpNum == 4) { + OpcodeTablePtr = &RegOp2MemOpTable4; + } + + // If table selected... + if (OpcodeTablePtr) { + // Find the Opcode to fuse + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + OpcodeTablePtr->find(MI->getOpcode()); + if (I != OpcodeTablePtr->end()) { + unsigned Opcode = I->second.first; + unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; + if (Align < MinAlign) + return nullptr; + bool NarrowToMOV32rm = false; + if (Size) { + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size < RCSize) { + // Check if it's safe to fold the load. If the size of the object is + // narrower than the load width, then it's not. + if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) + return nullptr; + // If this is a 64-bit load, but the spill slot is 32, then we can do + // a 32-bit load which is implicitly zero-extended. This likely is + // due to live interval analysis remat'ing a load from stack slot. + if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) + return nullptr; + Opcode = X86::MOV32rm; + NarrowToMOV32rm = true; + } + } + + if (isTwoAddrFold) + NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this); + else + NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this); + + if (NarrowToMOV32rm) { + // If this is the special case where we use a MOV32rm to load a 32-bit + // value and zero-extend the top bits. Change the destination register + // to a 32-bit one. + unsigned DstReg = NewMI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); + else + NewMI->getOperand(0).setSubReg(X86::sub_32bit); + } + return NewMI; + } + } + + // If the instruction and target operand are commutable, commute the + // instruction and try again. + if (AllowCommute) { + unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; + if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { + bool HasDef = MI->getDesc().getNumDefs(); + unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; + unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); + unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); + bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied2 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); + + // If either of the commutable operands are tied to the destination + // then we can not commute + fold. + if ((HasDef && Reg0 == Reg1 && Tied1) || + (HasDef && Reg0 == Reg2 && Tied2)) + return nullptr; + + MachineInstr *CommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!CommutedMI) { + // Unable to commute. 
+ return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } + + // Attempt to fold with the commuted version of the instruction. + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, + Size, Align, /*AllowCommute=*/false); + if (NewMI) + return NewMI; + + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!UncommutedMI) { + // Unable to commute. + return nullptr; + } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. + return nullptr; + } + } + + // No fusion + if (PrintFailedFusing && !MI->isCopy()) + dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI; + return nullptr; +} + +/// Return true for all instructions that only update +/// the first 32 or 64-bits of the destination register and leave the rest +/// unmodified. This can be used to avoid folding loads if the instructions +/// only update part of the destination register, and the non-updated part is +/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these +/// instructions breaks the partial register dependency and it can improve +/// performance. e.g.: +/// +/// movss (%rdi), %xmm0 +/// cvtss2sd %xmm0, %xmm0 +/// +/// Instead of +/// cvtss2sd (%rdi), %xmm0 +/// +/// FIXME: This should be turned into a TSFlags. +/// +static bool hasPartialRegUpdate(unsigned Opcode) { + switch (Opcode) { + case X86::CVTSI2SSrr: + case X86::CVTSI2SSrm: + case X86::CVTSI2SS64rr: + case X86::CVTSI2SS64rm: + case X86::CVTSI2SDrr: + case X86::CVTSI2SDrm: + case X86::CVTSI2SD64rr: + case X86::CVTSI2SD64rm: + case X86::CVTSD2SSrr: + case X86::CVTSD2SSrm: + case X86::Int_CVTSD2SSrr: + case X86::Int_CVTSD2SSrm: + case X86::CVTSS2SDrr: + case X86::CVTSS2SDrm: + case X86::Int_CVTSS2SDrr: + case X86::Int_CVTSS2SDrm: + case X86::RCPSSr: + case X86::RCPSSm: + case X86::RCPSSr_Int: + case X86::RCPSSm_Int: + case X86::ROUNDSDr: + case X86::ROUNDSDm: + case X86::ROUNDSDr_Int: + case X86::ROUNDSSr: + case X86::ROUNDSSm: + case X86::ROUNDSSr_Int: + case X86::RSQRTSSr: + case X86::RSQRTSSm: + case X86::RSQRTSSr_Int: + case X86::RSQRTSSm_Int: + case X86::SQRTSSr: + case X86::SQRTSSm: + case X86::SQRTSSr_Int: + case X86::SQRTSSm_Int: + case X86::SQRTSDr: + case X86::SQRTSDm: + case X86::SQRTSDr_Int: + case X86::SQRTSDm_Int: + return true; + } + + return false; +} + +/// Inform the ExeDepsFix pass how many idle +/// instructions we would like before a partial register update. +unsigned X86InstrInfo:: +getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode())) + return 0; + + // If MI is marked as reading Reg, the partial register update is wanted. + const MachineOperand &MO = MI->getOperand(0); + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (MO.readsReg() || MI->readsVirtualRegister(Reg)) + return 0; + } else { + if (MI->readsRegister(Reg, TRI)) + return 0; + } + + // If any of the preceding 16 instructions are reading Reg, insert a + // dependency breaking instruction. The magic number is based on a few + // Nehalem experiments. 
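+  // As a sketch of the effect: for "sqrtss %xmm1, %xmm0", if nothing has
+  // written %xmm0 within the last ~16 instructions, ExeDepsFix is expected to
+  // break the false dependence (see breakPartialRegDependency below) instead
+  // of letting the sqrtss wait on the stale upper bits of %xmm0.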
+ return 16; +} + +// Return true for any instruction the copies the high bits of the first source +// operand into the unused high bits of the destination operand. +static bool hasUndefRegUpdate(unsigned Opcode) { + switch (Opcode) { + case X86::VCVTSI2SSrr: + case X86::VCVTSI2SSrm: + case X86::Int_VCVTSI2SSrr: + case X86::Int_VCVTSI2SSrm: + case X86::VCVTSI2SS64rr: + case X86::VCVTSI2SS64rm: + case X86::Int_VCVTSI2SS64rr: + case X86::Int_VCVTSI2SS64rm: + case X86::VCVTSI2SDrr: + case X86::VCVTSI2SDrm: + case X86::Int_VCVTSI2SDrr: + case X86::Int_VCVTSI2SDrm: + case X86::VCVTSI2SD64rr: + case X86::VCVTSI2SD64rm: + case X86::Int_VCVTSI2SD64rr: + case X86::Int_VCVTSI2SD64rm: + case X86::VCVTSD2SSrr: + case X86::VCVTSD2SSrm: + case X86::Int_VCVTSD2SSrr: + case X86::Int_VCVTSD2SSrm: + case X86::VCVTSS2SDrr: + case X86::VCVTSS2SDrm: + case X86::Int_VCVTSS2SDrr: + case X86::Int_VCVTSS2SDrm: + case X86::VRCPSSr: + case X86::VRCPSSm: + case X86::VRCPSSm_Int: + case X86::VROUNDSDr: + case X86::VROUNDSDm: + case X86::VROUNDSDr_Int: + case X86::VROUNDSSr: + case X86::VROUNDSSm: + case X86::VROUNDSSr_Int: + case X86::VRSQRTSSr: + case X86::VRSQRTSSm: + case X86::VRSQRTSSm_Int: + case X86::VSQRTSSr: + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + // AVX-512 + case X86::VCVTSD2SSZrr: + case X86::VCVTSD2SSZrm: + case X86::VCVTSS2SDZrr: + case X86::VCVTSS2SDZrm: + return true; + } + + return false; +} + +/// Inform the ExeDepsFix pass how many idle instructions we would like before +/// certain undef register reads. +/// +/// This catches the VCVTSI2SD family of instructions: +/// +/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14 +/// +/// We should to be careful *not* to catch VXOR idioms which are presumably +/// handled specially in the pipeline: +/// +/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1 +/// +/// Like getPartialRegUpdateClearance, this makes a strong assumption that the +/// high bits that are passed-through are not live. +unsigned X86InstrInfo:: +getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum, + const TargetRegisterInfo *TRI) const { + if (!hasUndefRegUpdate(MI->getOpcode())) + return 0; + + // Set the OpNum parameter to the first source operand. + OpNum = 1; + + const MachineOperand &MO = MI->getOperand(OpNum); + if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + // Use the same magic number as getPartialRegUpdateClearance. + return 16; + } + return 0; +} + +void X86InstrInfo:: +breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + unsigned Reg = MI->getOperand(OpNum).getReg(); + // If MI kills this register, the false dependence is already broken. + if (MI->killsRegister(Reg, TRI)) + return; + + if (X86::VR128RegClass.contains(Reg)) { + // These instructions are all floating point domain, so xorps is the best + // choice. + unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) + .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + MI->addRegisterKilled(Reg, TRI, true); + } else if (X86::VR256RegClass.contains(Reg)) { + // Use vxorps to clear the full ymm register. + // It wants to read and write the xmm sub-register. 
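+    // For example (illustrative), for %ymm2 this emits
+    //   vxorps %xmm2<undef>, %xmm2<undef>, %xmm2
+    // and adds %ymm2 as an implicit def so the full register is treated as
+    // redefined.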
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) + .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + MI->addRegisterKilled(Reg, TRI, true); + } +} + +MachineInstr *X86InstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex) const { + // Check switch flag + if (NoFusing) + return nullptr; + + // Unless optimizing for size, don't fold to avoid partial + // register update stalls + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) + return nullptr; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned Size = MFI->getObjectSize(FrameIndex); + unsigned Alignment = MFI->getObjectAlignment(FrameIndex); + // If the function stack isn't realigned we don't want to fold instructions + // that need increased alignment. + if (!RI.needsStackRealignment(MF)) + Alignment = + std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment()); + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + unsigned NewOpc = 0; + unsigned RCSize = 0; + switch (MI->getOpcode()) { + default: return nullptr; + case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; + case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; + case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break; + case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break; + } + // Check if it's safe to fold the load. If the size of the object is + // narrower than the load width, then it's not. + if (Size < RCSize) + return nullptr; + // Change to CMPXXri r, 0 first. + MI->setDesc(get(NewOpc)); + MI->getOperand(1).ChangeToImmediate(0); + } else if (Ops.size() != 1) + return nullptr; + + return foldMemoryOperandImpl(MF, MI, Ops[0], + MachineOperand::CreateFI(FrameIndex), InsertPt, + Size, Alignment, /*AllowCommute=*/true); +} + +/// Check if \p LoadMI is a partial register load that we can't fold into \p MI +/// because the latter uses contents that wouldn't be defined in the folded +/// version. For instance, this transformation isn't legal: +/// movss (%rdi), %xmm0 +/// addps %xmm0, %xmm0 +/// -> +/// addps (%rdi), %xmm0 +/// +/// But this one is: +/// movss (%rdi), %xmm0 +/// addss %xmm0, %xmm0 +/// -> +/// addss (%rdi), %xmm0 +/// +static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, + const MachineInstr &UserMI, + const MachineFunction &MF) { + unsigned Opc = LoadMI.getOpcode(); + unsigned UserOpc = UserMI.getOpcode(); + unsigned RegSize = + MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); + + if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) { + // These instructions only load 32 bits, we can't fold them if the + // destination register is wider than 32 bits (4 bytes), and its user + // instruction isn't scalar (SS). 
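+    // The scalar *_Int opcodes listed below only read the low element, so the
+    // narrow load can still be folded into them; they return false here
+    // (i.e. the fold is not blocked).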
+ switch (UserOpc) { + case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: + case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: + case X86::MULSSrr_Int: case X86::VMULSSrr_Int: + case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: + case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: + case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: + case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int: + return false; + default: + return true; + } + } + + if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) { + // These instructions only load 64 bits, we can't fold them if the + // destination register is wider than 64 bits (8 bytes), and its user + // instruction isn't scalar (SD). + switch (UserOpc) { + case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: + case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: + case X86::MULSDrr_Int: case X86::VMULSDrr_Int: + case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: + case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: + case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: + case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int: + return false; + default: + return true; + } + } + + return false; +} + +MachineInstr *X86InstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { + // If loading from a FrameIndex, fold directly from the FrameIndex. + unsigned NumOps = LoadMI->getDesc().getNumOperands(); + int FrameIndex; + if (isLoadFromStackSlot(LoadMI, FrameIndex)) { + if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF)) + return nullptr; + return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex); + } + + // Check switch flag + if (NoFusing) return nullptr; + + // Avoid partial register update stalls unless optimizing for size. + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) + return nullptr; + + // Determine the alignment of the load. + unsigned Alignment = 0; + if (LoadMI->hasOneMemOperand()) + Alignment = (*LoadMI->memoperands_begin())->getAlignment(); + else + switch (LoadMI->getOpcode()) { + case X86::AVX2_SETALLONES: + case X86::AVX_SET0: + Alignment = 32; + break; + case X86::V_SET0: + case X86::V_SETALLONES: + Alignment = 16; + break; + case X86::FsFLD0SD: + Alignment = 8; + break; + case X86::FsFLD0SS: + Alignment = 4; + break; + default: + return nullptr; + } + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + unsigned NewOpc = 0; + switch (MI->getOpcode()) { + default: return nullptr; + case X86::TEST8rr: NewOpc = X86::CMP8ri; break; + case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; + case X86::TEST32rr: NewOpc = X86::CMP32ri8; break; + case X86::TEST64rr: NewOpc = X86::CMP64ri8; break; + } + // Change to CMPXXri r, 0 first. + MI->setDesc(get(NewOpc)); + MI->getOperand(1).ChangeToImmediate(0); + } else if (Ops.size() != 1) + return nullptr; + + // Make sure the subregisters match. + // Otherwise we risk changing the size of the load. 
+ if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) + return nullptr; + + SmallVector<MachineOperand,X86::AddrNumOperands> MOs; + switch (LoadMI->getOpcode()) { + case X86::V_SET0: + case X86::V_SETALLONES: + case X86::AVX2_SETALLONES: + case X86::AVX_SET0: + case X86::FsFLD0SD: + case X86::FsFLD0SS: { + // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. + // Create a constant-pool entry and operands to load from it. + + // Medium and large mode can't fold loads this way. + if (MF.getTarget().getCodeModel() != CodeModel::Small && + MF.getTarget().getCodeModel() != CodeModel::Kernel) + return nullptr; + + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + if (MF.getTarget().getRelocationModel() == Reloc::PIC_) { + if (Subtarget.is64Bit()) + PICBase = X86::RIP; + else + // FIXME: PICBase = getGlobalBaseReg(&MF); + // This doesn't work for several reasons. + // 1. GlobalBaseReg may have been spilled. + // 2. It may not be live at MI. + return nullptr; + } + + // Create a constant-pool entry. + MachineConstantPool &MCP = *MF.getConstantPool(); + Type *Ty; + unsigned Opc = LoadMI->getOpcode(); + if (Opc == X86::FsFLD0SS) + Ty = Type::getFloatTy(MF.getFunction()->getContext()); + else if (Opc == X86::FsFLD0SD) + Ty = Type::getDoubleTy(MF.getFunction()->getContext()); + else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0) + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); + else + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); + + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES); + const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : + Constant::getNullValue(Ty); + unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); + + // Create operands to load from the constant pool entry. + MOs.push_back(MachineOperand::CreateReg(PICBase, false)); + MOs.push_back(MachineOperand::CreateImm(1)); + MOs.push_back(MachineOperand::CreateReg(0, false)); + MOs.push_back(MachineOperand::CreateCPI(CPI, 0)); + MOs.push_back(MachineOperand::CreateReg(0, false)); + break; + } + default: { + if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF)) + return nullptr; + + // Folding a normal load. Just copy the load's address operands. + MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands, + LoadMI->operands_begin() + NumOps); + break; + } + } + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt, + /*Size=*/0, Alignment, /*AllowCommute=*/true); +} + +bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(MI->getOpcode()); + if (I == MemOp2RegOpTable.end()) + return false; + unsigned Opc = I->second.first; + unsigned Index = I->second.second & TB_INDEX_MASK; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + if (UnfoldLoad && !FoldedLoad) + return false; + UnfoldLoad &= FoldedLoad; + if (UnfoldStore && !FoldedStore) + return false; + UnfoldStore &= FoldedStore; + + const MCInstrDesc &MCID = get(Opc); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + // TODO: Check if 32-byte or greater accesses are slow too? 
+ if (!MI->hasOneMemOperand() && + RC == &X86::VR128RegClass && + Subtarget.isUnalignedMem16Slow()) + // Without memoperands, loadRegFromAddr and storeRegToStackSlot will + // conservatively assume the address is unaligned. That's bad for + // performance. + return false; + SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps; + SmallVector<MachineOperand,2> BeforeOps; + SmallVector<MachineOperand,2> AfterOps; + SmallVector<MachineOperand,4> ImpOps; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (i >= Index && i < Index + X86::AddrNumOperands) + AddrOps.push_back(Op); + else if (Op.isReg() && Op.isImplicit()) + ImpOps.push_back(Op); + else if (i < Index) + BeforeOps.push_back(Op); + else if (i > Index) + AfterOps.push_back(Op); + } + + // Emit the load instruction. + if (UnfoldLoad) { + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractLoadMemRefs(MI->memoperands_begin(), + MI->memoperands_end()); + loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs); + if (UnfoldStore) { + // Address operands cannot be marked isKill. + for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { + MachineOperand &MO = NewMIs[0]->getOperand(i); + if (MO.isReg()) + MO.setIsKill(false); + } + } + } + + // Emit the data processing instruction. + MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, DataMI); + + if (FoldedStore) + MIB.addReg(Reg, RegState::Define); + for (MachineOperand &BeforeOp : BeforeOps) + MIB.addOperand(BeforeOp); + if (FoldedLoad) + MIB.addReg(Reg); + for (MachineOperand &AfterOp : AfterOps) + MIB.addOperand(AfterOp); + for (MachineOperand &ImpOp : ImpOps) { + MIB.addReg(ImpOp.getReg(), + getDefRegState(ImpOp.isDef()) | + RegState::Implicit | + getKillRegState(ImpOp.isKill()) | + getDeadRegState(ImpOp.isDead()) | + getUndefRegState(ImpOp.isUndef())); + } + // Change CMP32ri r, 0 back to TEST32rr r, r, etc. + switch (DataMI->getOpcode()) { + default: break; + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP8ri: { + MachineOperand &MO0 = DataMI->getOperand(0); + MachineOperand &MO1 = DataMI->getOperand(1); + if (MO1.getImm() == 0) { + unsigned NewOpc; + switch (DataMI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::CMP64ri8: + case X86::CMP64ri32: NewOpc = X86::TEST64rr; break; + case X86::CMP32ri8: + case X86::CMP32ri: NewOpc = X86::TEST32rr; break; + case X86::CMP16ri8: + case X86::CMP16ri: NewOpc = X86::TEST16rr; break; + case X86::CMP8ri: NewOpc = X86::TEST8rr; break; + } + DataMI->setDesc(get(NewOpc)); + MO1.ChangeToRegister(MO0.getReg(), false); + } + } + } + NewMIs.push_back(DataMI); + + // Emit the store instruction. 
+ if (UnfoldStore) { + const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractStoreMemRefs(MI->memoperands_begin(), + MI->memoperands_end()); + storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs); + } + + return true; +} + +bool +X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const { + if (!N->isMachineOpcode()) + return false; + + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(N->getMachineOpcode()); + if (I == MemOp2RegOpTable.end()) + return false; + unsigned Opc = I->second.first; + unsigned Index = I->second.second & TB_INDEX_MASK; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + const MCInstrDesc &MCID = get(Opc); + MachineFunction &MF = DAG.getMachineFunction(); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + unsigned NumDefs = MCID.NumDefs; + std::vector<SDValue> AddrOps; + std::vector<SDValue> BeforeOps; + std::vector<SDValue> AfterOps; + SDLoc dl(N); + unsigned NumOps = N->getNumOperands(); + for (unsigned i = 0; i != NumOps-1; ++i) { + SDValue Op = N->getOperand(i); + if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands) + AddrOps.push_back(Op); + else if (i < Index-NumDefs) + BeforeOps.push_back(Op); + else if (i > Index-NumDefs) + AfterOps.push_back(Op); + } + SDValue Chain = N->getOperand(NumOps-1); + AddrOps.push_back(Chain); + + // Emit the load instruction. + SDNode *Load = nullptr; + if (FoldedLoad) { + EVT VT = *RC->vt_begin(); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), + cast<MachineSDNode>(N)->memoperands_end()); + if (!(*MMOs.first) && + RC == &X86::VR128RegClass && + Subtarget.isUnalignedMem16Slow()) + // Do not introduce a slow unaligned load. + return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (*MMOs.first) && + (*MMOs.first)->getAlignment() >= Alignment; + Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, + VT, MVT::Other, AddrOps); + NewNodes.push_back(Load); + + // Preserve memory reference information. + cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second); + } + + // Emit the data processing instruction. + std::vector<EVT> VTs; + const TargetRegisterClass *DstRC = nullptr; + if (MCID.getNumDefs() > 0) { + DstRC = getRegClass(MCID, 0, &RI, MF); + VTs.push_back(*DstRC->vt_begin()); + } + for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { + EVT VT = N->getValueType(i); + if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs()) + VTs.push_back(VT); + } + if (Load) + BeforeOps.push_back(SDValue(Load, 0)); + BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end()); + SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps); + NewNodes.push_back(NewNode); + + // Emit the store instruction. 
+ if (FoldedStore) { + AddrOps.pop_back(); + AddrOps.push_back(SDValue(NewNode, 0)); + AddrOps.push_back(Chain); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), + cast<MachineSDNode>(N)->memoperands_end()); + if (!(*MMOs.first) && + RC == &X86::VR128RegClass && + Subtarget.isUnalignedMem16Slow()) + // Do not introduce a slow unaligned store. + return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (*MMOs.first) && + (*MMOs.first)->getAlignment() >= Alignment; + SDNode *Store = + DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget), + dl, MVT::Other, AddrOps); + NewNodes.push_back(Store); + + // Preserve memory reference information. + cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second); + } + + return true; +} + +unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(Opc); + if (I == MemOp2RegOpTable.end()) + return 0; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + if (UnfoldLoad && !FoldedLoad) + return 0; + if (UnfoldStore && !FoldedStore) + return 0; + if (LoadRegIndex) + *LoadRegIndex = I->second.second & TB_INDEX_MASK; + return I->second.first; +} + +bool +X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, int64_t &Offset2) const { + if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) + return false; + unsigned Opc1 = Load1->getMachineOpcode(); + unsigned Opc2 = Load2->getMachineOpcode(); + switch (Opc1) { + default: return false; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp32m: + case X86::LD_Fp64m: + case X86::LD_Fp80m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::FsMOVAPSrm: + case X86::FsMOVAPDrm: + case X86::MOVAPSrm: + case X86::MOVUPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::MOVDQUrm: + // AVX load instructions + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::FsVMOVAPSrm: + case X86::FsVMOVAPDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVDQUrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: + break; + } + switch (Opc2) { + default: return false; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp32m: + case X86::LD_Fp64m: + case X86::LD_Fp80m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::FsMOVAPSrm: + case X86::FsMOVAPDrm: + case X86::MOVAPSrm: + case X86::MOVUPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::MOVDQUrm: + // AVX load instructions + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::FsVMOVAPSrm: + case X86::FsVMOVAPDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVDQUrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: + break; + } + + // Check if chain operands and base addresses match. 
+ if (Load1->getOperand(0) != Load2->getOperand(0) || + Load1->getOperand(5) != Load2->getOperand(5)) + return false; + // Segment operands should match as well. + if (Load1->getOperand(4) != Load2->getOperand(4)) + return false; + // Scale should be 1, Index should be Reg0. + if (Load1->getOperand(1) == Load2->getOperand(1) && + Load1->getOperand(2) == Load2->getOperand(2)) { + if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1) + return false; + + // Now let's examine the displacements. + if (isa<ConstantSDNode>(Load1->getOperand(3)) && + isa<ConstantSDNode>(Load2->getOperand(3))) { + Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue(); + Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue(); + return true; + } + } + return false; +} + +bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + assert(Offset2 > Offset1); + if ((Offset2 - Offset1) / 8 > 64) + return false; + + unsigned Opc1 = Load1->getMachineOpcode(); + unsigned Opc2 = Load2->getMachineOpcode(); + if (Opc1 != Opc2) + return false; // FIXME: overly conservative? + + switch (Opc1) { + default: break; + case X86::LD_Fp32m: + case X86::LD_Fp64m: + case X86::LD_Fp80m: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + return false; + } + + EVT VT = Load1->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: + // XMM registers. In 64-bit mode we can be a bit more aggressive since we + // have 16 of them to play with. + if (Subtarget.is64Bit()) { + if (NumLoads >= 3) + return false; + } else if (NumLoads) { + return false; + } + break; + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f32: + case MVT::f64: + if (NumLoads) + return false; + break; + } + + return true; +} + +bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const { + // Check if this processor supports macro-fusion. Since this is a minor + // heuristic, we haven't specifically reserved a feature. hasAVX is a decent + // proxy for SandyBridge+. 
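Before the macro-fusion check below, the load-clustering policy just implemented can be summarized in a self-contained model: identical opcodes, offsets no more than a few hundred bytes apart, and a cap on how many loads are already clustered, relaxed for XMM loads in 64-bit mode where sixteen registers are available. This is a sketch of the policy, not the TargetInstrInfo hook itself.

#include <cstdint>

enum class LoadClass { Scalar, Xmm };

// Standalone model of the shouldScheduleLoadsNear decision above.
bool shouldClusterLoads(unsigned Opc1, unsigned Opc2,
                        int64_t Offset1, int64_t Offset2,
                        unsigned NumLoads, LoadClass Class, bool Is64Bit) {
  // Offsets arrive pre-sorted; reject loads that are too far apart.
  if (Offset2 <= Offset1 || (Offset2 - Offset1) / 8 > 64)
    return false;
  // Conservatively require matching opcodes, as the code above does.
  if (Opc1 != Opc2)
    return false;
  if (Class == LoadClass::Xmm)
    return Is64Bit ? NumLoads < 3 : NumLoads == 0; // more XMM registers in 64-bit mode
  return NumLoads == 0;                            // scalar loads: pair at most two
}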
+ if (!Subtarget.hasAVX()) + return false; + + enum { + FuseTest, + FuseCmp, + FuseInc + } FuseKind; + + switch(Second->getOpcode()) { + default: + return false; + case X86::JE_1: + case X86::JNE_1: + case X86::JL_1: + case X86::JLE_1: + case X86::JG_1: + case X86::JGE_1: + FuseKind = FuseInc; + break; + case X86::JB_1: + case X86::JBE_1: + case X86::JA_1: + case X86::JAE_1: + FuseKind = FuseCmp; + break; + case X86::JS_1: + case X86::JNS_1: + case X86::JP_1: + case X86::JNP_1: + case X86::JO_1: + case X86::JNO_1: + FuseKind = FuseTest; + break; + } + switch (First->getOpcode()) { + default: + return false; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + case X86::TEST8ri: + case X86::TEST16ri: + case X86::TEST32ri: + case X86::TEST32i32: + case X86::TEST64i32: + case X86::TEST64ri32: + case X86::TEST8rm: + case X86::TEST16rm: + case X86::TEST32rm: + case X86::TEST64rm: + case X86::TEST8ri_NOREX: + case X86::AND16i16: + case X86::AND16ri: + case X86::AND16ri8: + case X86::AND16rm: + case X86::AND16rr: + case X86::AND32i32: + case X86::AND32ri: + case X86::AND32ri8: + case X86::AND32rm: + case X86::AND32rr: + case X86::AND64i32: + case X86::AND64ri32: + case X86::AND64ri8: + case X86::AND64rm: + case X86::AND64rr: + case X86::AND8i8: + case X86::AND8ri: + case X86::AND8rm: + case X86::AND8rr: + return true; + case X86::CMP16i16: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP16rm: + case X86::CMP16rr: + case X86::CMP32i32: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP32rm: + case X86::CMP32rr: + case X86::CMP64i32: + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP64rm: + case X86::CMP64rr: + case X86::CMP8i8: + case X86::CMP8ri: + case X86::CMP8rm: + case X86::CMP8rr: + case X86::ADD16i16: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri8_DB: + case X86::ADD16ri_DB: + case X86::ADD16rm: + case X86::ADD16rr: + case X86::ADD16rr_DB: + case X86::ADD32i32: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri8_DB: + case X86::ADD32ri_DB: + case X86::ADD32rm: + case X86::ADD32rr: + case X86::ADD32rr_DB: + case X86::ADD64i32: + case X86::ADD64ri32: + case X86::ADD64ri32_DB: + case X86::ADD64ri8: + case X86::ADD64ri8_DB: + case X86::ADD64rm: + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD8i8: + case X86::ADD8mi: + case X86::ADD8mr: + case X86::ADD8ri: + case X86::ADD8rm: + case X86::ADD8rr: + case X86::SUB16i16: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB16rm: + case X86::SUB16rr: + case X86::SUB32i32: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB32rm: + case X86::SUB32rr: + case X86::SUB64i32: + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB64rm: + case X86::SUB64rr: + case X86::SUB8i8: + case X86::SUB8ri: + case X86::SUB8rm: + case X86::SUB8rr: + return FuseKind == FuseCmp || FuseKind == FuseInc; + case X86::INC16r: + case X86::INC32r: + case X86::INC64r: + case X86::INC8r: + case X86::DEC16r: + case X86::DEC32r: + case X86::DEC64r: + case X86::DEC8r: + return FuseKind == FuseInc; + } +} + +bool X86InstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 1 && "Invalid X86 branch condition!"); + X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm()); + if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E) + return true; + Cond[0].setImm(GetOppositeBranchCondition(CC)); + return false; +} + +bool X86InstrInfo:: +isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // FIXME: 
Return false for x87 stack register classes for now. We can't + // allow any loads of these registers before FpGet_ST0_80. + return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass || + RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass); +} + +/// Return a virtual register initialized with the +/// the global base register value. Output instructions required to +/// initialize the register in the function entry block, if necessary. +/// +/// TODO: Eliminate this and move the code to X86MachineFunctionInfo. +/// +unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { + assert(!Subtarget.is64Bit() && + "X86-64 PIC uses RIP relative addressing"); + + X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); + unsigned GlobalBaseReg = X86FI->getGlobalBaseReg(); + if (GlobalBaseReg != 0) + return GlobalBaseReg; + + // Create the register. The code to initialize it is inserted + // later, by the CGBR pass (below). + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + X86FI->setGlobalBaseReg(GlobalBaseReg); + return GlobalBaseReg; +} + +// These are the replaceable SSE instructions. Some of these have Int variants +// that we don't include here. We don't want to replace instructions selected +// by intrinsics. +static const uint16_t ReplaceableInstrs[][3] = { + //PackedSingle PackedDouble PackedInt + { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, + { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, + { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, + { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, + { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, + { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, + { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, + { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, + { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, + { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm }, + { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr }, + { X86::ORPSrm, X86::ORPDrm, X86::PORrm }, + { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, + { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, + { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, + // AVX 128-bit support + { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr }, + { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm }, + { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, + { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, + { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, + { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr }, + { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, + { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, + { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, + { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm }, + { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr }, + { X86::VORPSrm, X86::VORPDrm, X86::VPORrm }, + { X86::VORPSrr, X86::VORPDrr, X86::VPORrr }, + { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm }, + { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr }, + // AVX 256-bit support + { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr }, + { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm }, + { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, + { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, + { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, + { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr } +}; + +static const uint16_t ReplaceableInstrsAVX2[][3] = { + //PackedSingle PackedDouble PackedInt + { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm }, + { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr }, + 
{ X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm }, + { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr }, + { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm }, + { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, + { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm }, + { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, + { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, + { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, + { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, + { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, + { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }, + { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm}, + { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr}, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr}, + { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm}, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, + { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm} +}; + +// FIXME: Some shuffle and unpack instructions have equivalents in different +// domains, but they require a bit more work than just switching opcodes. + +static const uint16_t *lookup(unsigned opcode, unsigned domain) { + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; + return nullptr; +} + +static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; + return nullptr; +} + +std::pair<uint16_t, uint16_t> +X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { + uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + bool hasAVX2 = Subtarget.hasAVX2(); + uint16_t validDomains = 0; + if (domain && lookup(MI->getOpcode(), domain)) + validDomains = 0xe; + else if (domain && lookupAVX2(MI->getOpcode(), domain)) + validDomains = hasAVX2 ? 0xe : 0x6; + return std::make_pair(domain, validDomains); +} + +void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { + assert(Domain>0 && Domain<4 && "Invalid execution domain"); + uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + assert(dom && "Not an SSE instruction"); + const uint16_t *table = lookup(MI->getOpcode(), dom); + if (!table) { // try the other table + assert((Subtarget.hasAVX2() || Domain < 3) && + "256-bit vector operations only available in AVX2"); + table = lookupAVX2(MI->getOpcode(), dom); + } + assert(table && "Cannot change domain"); + MI->setDesc(get(table[Domain-1])); +} + +/// Return the noop instruction to use for a noop. +void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(X86::NOOP); +} + +// This code must remain in sync with getJumpInstrTableEntryBound in this class! +// In particular, getJumpInstrTableEntryBound must always return an upper bound +// on the encoding lengths of the instructions generated by +// getUnconditionalBranch and getTrap. +void X86InstrInfo::getUnconditionalBranch( + MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { + Branch.setOpcode(X86::JMP_1); + Branch.addOperand(MCOperand::createExpr(BranchTarget)); +} + +// This code must remain in sync with getJumpInstrTableEntryBound in this class! 
+// In particular, getJumpInstrTableEntryBound must always return an upper bound +// on the encoding lengths of the instructions generated by +// getUnconditionalBranch and getTrap. +void X86InstrInfo::getTrap(MCInst &MI) const { + MI.setOpcode(X86::TRAP); +} + +// See getTrap and getUnconditionalBranch for conditions on the value returned +// by this function. +unsigned X86InstrInfo::getJumpInstrTableEntryBound() const { + // 5 bytes suffice: JMP_4 Symbol@PLT is uses 1 byte (E9) for the JMP_4 and 4 + // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B). + return 5; +} + +bool X86InstrInfo::isHighLatencyDef(int opc) const { + switch (opc) { + default: return false; + case X86::DIVSDrm: + case X86::DIVSDrm_Int: + case X86::DIVSDrr: + case X86::DIVSDrr_Int: + case X86::DIVSSrm: + case X86::DIVSSrm_Int: + case X86::DIVSSrr: + case X86::DIVSSrr_Int: + case X86::SQRTPDm: + case X86::SQRTPDr: + case X86::SQRTPSm: + case X86::SQRTPSr: + case X86::SQRTSDm: + case X86::SQRTSDm_Int: + case X86::SQRTSDr: + case X86::SQRTSDr_Int: + case X86::SQRTSSm: + case X86::SQRTSSm_Int: + case X86::SQRTSSr: + case X86::SQRTSSr_Int: + // AVX instructions with high latency + case X86::VDIVSDrm: + case X86::VDIVSDrm_Int: + case X86::VDIVSDrr: + case X86::VDIVSDrr_Int: + case X86::VDIVSSrm: + case X86::VDIVSSrm_Int: + case X86::VDIVSSrr: + case X86::VDIVSSrr_Int: + case X86::VSQRTPDm: + case X86::VSQRTPDr: + case X86::VSQRTPSm: + case X86::VSQRTPSr: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSSr: + case X86::VSQRTPDZm: + case X86::VSQRTPDZr: + case X86::VSQRTPSZm: + case X86::VSQRTPSZr: + case X86::VSQRTSDZm: + case X86::VSQRTSDZm_Int: + case X86::VSQRTSDZr: + case X86::VSQRTSSZm_Int: + case X86::VSQRTSSZr: + case X86::VSQRTSSZm: + case X86::VDIVSDZrm: + case X86::VDIVSDZrr: + case X86::VDIVSSZrm: + case X86::VDIVSSZrr: + + case X86::VGATHERQPSZrm: + case X86::VGATHERQPDZrm: + case X86::VGATHERDPDZrm: + case X86::VGATHERDPSZrm: + case X86::VPGATHERQDZrm: + case X86::VPGATHERQQZrm: + case X86::VPGATHERDDZrm: + case X86::VPGATHERDQZrm: + case X86::VSCATTERQPDZmr: + case X86::VSCATTERQPSZmr: + case X86::VSCATTERDPDZmr: + case X86::VSCATTERDPSZmr: + case X86::VPSCATTERQDZmr: + case X86::VPSCATTERQQZmr: + case X86::VPSCATTERDDZmr: + case X86::VPSCATTERDQZmr: + return true; + } +} + +bool X86InstrInfo:: +hasHighOperandLatency(const TargetSchedModel &SchedModel, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + return isHighLatencyDef(DefMI->getOpcode()); +} + +bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, + const MachineBasicBlock *MBB) const { + assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) && + "Reassociation needs binary operators"); + + // Integer binary math/logic instructions have a third source operand: + // the EFLAGS register. That operand must be both defined here and never + // used; ie, it must be dead. If the EFLAGS operand is live, then we can + // not change anything because rearranging the operands could affect other + // instructions that depend on the exact status flags (zero, sign, etc.) + // that are set by using these particular operands with this operation. 
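The rule just described is enforced by the check that follows; in isolation it amounts to the following. This is a sketch with a toy operand model, not the MachineInstr/MachineOperand API.

#include <vector>

// Just enough structure to express the rule: a 4-operand integer op carries
// EFLAGS as its last operand, and reassociation is only legal when that def
// is dead.
struct OperandModel {
  bool IsEFLAGS;
  bool IsDead;
};

struct InstModel {
  std::vector<OperandModel> Ops;
};

bool eflagsAllowsReassociation(const InstModel &I) {
  if (I.Ops.size() != 4)
    return true;                          // FP forms carry no EFLAGS operand
  const OperandModel &Flags = I.Ops[3];
  return Flags.IsEFLAGS && Flags.IsDead;  // live flags would be corrupted by reordering
}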
+ if (Inst.getNumOperands() == 4) { + assert(Inst.getOperand(3).isReg() && + Inst.getOperand(3).getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + if (!Inst.getOperand(3).isDead()) + return false; + } + + return TargetInstrInfo::hasReassociableOperands(Inst, MBB); +} + +// TODO: There are many more machine instruction opcodes to match: +// 1. Other data types (integer, vectors) +// 2. Other math / logic operations (xor, or) +// 3. Other forms of the same operation (intrinsics and other variants) +bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case X86::AND8rr: + case X86::AND16rr: + case X86::AND32rr: + case X86::AND64rr: + case X86::OR8rr: + case X86::OR16rr: + case X86::OR32rr: + case X86::OR64rr: + case X86::XOR8rr: + case X86::XOR16rr: + case X86::XOR32rr: + case X86::XOR64rr: + case X86::IMUL16rr: + case X86::IMUL32rr: + case X86::IMUL64rr: + case X86::PANDrr: + case X86::PORrr: + case X86::PXORrr: + case X86::VPANDrr: + case X86::VPANDYrr: + case X86::VPORrr: + case X86::VPORYrr: + case X86::VPXORrr: + case X86::VPXORYrr: + // Normal min/max instructions are not commutative because of NaN and signed + // zero semantics, but these are. Thus, there's no need to check for global + // relaxed math; the instructions themselves have the properties we need. + case X86::MAXCPDrr: + case X86::MAXCPSrr: + case X86::MAXCSDrr: + case X86::MAXCSSrr: + case X86::MINCPDrr: + case X86::MINCPSrr: + case X86::MINCSDrr: + case X86::MINCSSrr: + case X86::VMAXCPDrr: + case X86::VMAXCPSrr: + case X86::VMAXCPDYrr: + case X86::VMAXCPSYrr: + case X86::VMAXCSDrr: + case X86::VMAXCSSrr: + case X86::VMINCPDrr: + case X86::VMINCPSrr: + case X86::VMINCPDYrr: + case X86::VMINCPSYrr: + case X86::VMINCSDrr: + case X86::VMINCSSrr: + return true; + case X86::ADDPDrr: + case X86::ADDPSrr: + case X86::ADDSDrr: + case X86::ADDSSrr: + case X86::MULPDrr: + case X86::MULPSrr: + case X86::MULSDrr: + case X86::MULSSrr: + case X86::VADDPDrr: + case X86::VADDPSrr: + case X86::VADDPDYrr: + case X86::VADDPSYrr: + case X86::VADDSDrr: + case X86::VADDSSrr: + case X86::VMULPDrr: + case X86::VMULPSrr: + case X86::VMULPDYrr: + case X86::VMULPSYrr: + case X86::VMULSDrr: + case X86::VMULSSrr: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + return false; + } +} + +/// This is an architecture-specific helper function of reassociateOps. +/// Set special operand attributes for new instructions after reassociation. +void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, + MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const { + // Integer instructions define an implicit EFLAGS source register operand as + // the third source (fourth total) operand. 
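Before that operand handling continues below, the classification in isAssociativeAndCommutative above is worth restating without its opcode lists: it reduces to three buckets. A sketch of that reduction; mapping an opcode to its bucket would come from the elided switch.

// Three buckets from the switch above: integer/logic ops and the commutative
// "C" min/max forms always qualify; plain FP add/mul only qualify under
// unsafe-fp-math.
enum class ReassocClass { NotReassociable, AlwaysReassociable, FPNeedsUnsafeMath };

bool isAssociativeAndCommutativeModel(ReassocClass C, bool UnsafeFPMath) {
  switch (C) {
  case ReassocClass::AlwaysReassociable: return true;
  case ReassocClass::FPNeedsUnsafeMath:  return UnsafeFPMath;
  case ReassocClass::NotReassociable:    return false;
  }
  return false;
}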
+ if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4) + return; + + assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 && + "Unexpected instruction type for reassociation"); + + MachineOperand &OldOp1 = OldMI1.getOperand(3); + MachineOperand &OldOp2 = OldMI2.getOperand(3); + MachineOperand &NewOp1 = NewMI1.getOperand(3); + MachineOperand &NewOp2 = NewMI2.getOperand(3); + + assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + + (void)OldOp1; + (void)OldOp2; + + assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + + // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations + // of this pass or other passes. The EFLAGS operands must be dead in these new + // instructions because the EFLAGS operands in the original instructions must + // be dead in order for reassociation to occur. + NewOp1.setIsDead(); + NewOp2.setIsDead(); +} + +std::pair<unsigned, unsigned> +X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF, 0u); +} + +ArrayRef<std::pair<unsigned, const char *>> +X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace X86II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"}, + {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"}, + {MO_GOT, "x86-got"}, + {MO_GOTOFF, "x86-gotoff"}, + {MO_GOTPCREL, "x86-gotpcrel"}, + {MO_PLT, "x86-plt"}, + {MO_TLSGD, "x86-tlsgd"}, + {MO_TLSLD, "x86-tlsld"}, + {MO_TLSLDM, "x86-tlsldm"}, + {MO_GOTTPOFF, "x86-gottpoff"}, + {MO_INDNTPOFF, "x86-indntpoff"}, + {MO_TPOFF, "x86-tpoff"}, + {MO_DTPOFF, "x86-dtpoff"}, + {MO_NTPOFF, "x86-ntpoff"}, + {MO_GOTNTPOFF, "x86-gotntpoff"}, + {MO_DLLIMPORT, "x86-dllimport"}, + {MO_DARWIN_STUB, "x86-darwin-stub"}, + {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, + {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, + {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"}, + {MO_TLVP, "x86-tlvp"}, + {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, + {MO_SECREL, "x86-secrel"}}; + return makeArrayRef(TargetFlags); +} + +namespace { + /// Create Global Base Reg pass. This initializes the PIC + /// global base register for x86-32. + struct CGBR : public MachineFunctionPass { + static char ID; + CGBR() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + const X86TargetMachine *TM = + static_cast<const X86TargetMachine *>(&MF.getTarget()); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + + // Don't do anything if this is 64-bit as 64-bit PIC + // uses RIP relative addressing. + if (STI.is64Bit()) + return false; + + // Only emit a global base reg in PIC mode. + if (TM->getRelocationModel() != Reloc::PIC_) + return false; + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + unsigned GlobalBaseReg = X86FI->getGlobalBaseReg(); + + // If we didn't need a GlobalBaseReg, don't insert code. 
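Together with the 64-bit and relocation-model early-exits above, the GlobalBaseReg test below completes a three-part gate. A minimal standalone model of that gate; the struct and field names are invented for the sketch.

// Minimal model of when the CGBR pass inserts code: 32-bit target, PIC
// relocation model, and a global base register actually requested earlier
// by getGlobalBaseReg.
struct PICStateModel {
  bool Is64Bit;
  bool IsPIC;
  unsigned GlobalBaseReg; // 0 means never requested
};

bool needsGlobalBaseRegInit(const PICStateModel &S) {
  if (S.Is64Bit)
    return false;           // x86-64 PIC uses RIP-relative addressing instead
  if (!S.IsPIC)
    return false;           // only PIC code needs a pic/GOT base
  return S.GlobalBaseReg != 0;
}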
+ if (GlobalBaseReg == 0) + return false; + + // Insert the set of GlobalBaseReg into the first MBB of the function + MachineBasicBlock &FirstMBB = MF.front(); + MachineBasicBlock::iterator MBBI = FirstMBB.begin(); + DebugLoc DL = FirstMBB.findDebugLoc(MBBI); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + const X86InstrInfo *TII = STI.getInstrInfo(); + + unsigned PC; + if (STI.isPICStyleGOT()) + PC = RegInfo.createVirtualRegister(&X86::GR32RegClass); + else + PC = GlobalBaseReg; + + // Operand of MovePCtoStack is completely ignored by asm printer. It's + // only used in JIT code emission as displacement to pc. + BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0); + + // If we're using vanilla 'GOT' PIC style, we should use relative addressing + // not to pc, but to _GLOBAL_OFFSET_TABLE_ external. + if (STI.isPICStyleGOT()) { + // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register + BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg) + .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_", + X86II::MO_GOT_ABSOLUTE_ADDRESS); + } + + return true; + } + + const char *getPassName() const override { + return "X86 PIC Global Base Reg Initialization"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +char CGBR::ID = 0; +FunctionPass* +llvm::createX86GlobalBaseRegPass() { return new CGBR(); } + +namespace { + struct LDTLSCleanup : public MachineFunctionPass { + static char ID; + LDTLSCleanup() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>(); + if (MFI->getNumLocalDynamicTLSAccesses() < 2) { + // No point folding accesses if there isn't at least two. + return false; + } + + MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + return VisitNode(DT->getRootNode(), 0); + } + + // Visit the dominator subtree rooted at Node in pre-order. + // If TLSBaseAddrReg is non-null, then use that to replace any + // TLS_base_addr instructions. Otherwise, create the register + // when the first such instruction is seen, and then use it + // as we encounter more instructions. + bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { + MachineBasicBlock *BB = Node->getBlock(); + bool Changed = false; + + // Traverse the current block. + for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; + ++I) { + switch (I->getOpcode()) { + case X86::TLS_base_addr32: + case X86::TLS_base_addr64: + if (TLSBaseAddrReg) + I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg); + else + I = SetRegister(I, &TLSBaseAddrReg); + Changed = true; + break; + default: + break; + } + } + + // Visit the children of this block in the dominator tree. + for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end(); + I != E; ++I) { + Changed |= VisitNode(*I, TLSBaseAddrReg); + } + + return Changed; + } + + // Replace the TLS_base_addr instruction I with a copy from + // TLSBaseAddrReg, returning the new instruction. + MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I, + unsigned TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); + const bool is64Bit = STI.is64Bit(); + const X86InstrInfo *TII = STI.getInstrInfo(); + + // Insert a Copy from TLSBaseAddrReg to RAX/EAX. 
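Before the copy is built below, the shape of the walk that got us here is worth seeing on its own: the first TLS_base_addr encountered creates the base register, and every occurrence in a dominated block reuses it. A toy model of that traversal, not the MachineDominatorTree API.

#include <vector>

// Toy dominator-tree node: TLSCalls counts TLS_base_addr-like instructions in
// the corresponding block. Only the lazy-create-then-reuse walk is modeled.
struct DomNodeModel {
  unsigned TLSCalls = 0;
  std::vector<DomNodeModel *> Children;
};

// BaseReg == 0 means "not created yet"; NextVReg hands out fresh registers.
bool visitTLS(DomNodeModel *N, unsigned BaseReg, unsigned &NextVReg) {
  bool Changed = false;
  for (unsigned i = 0; i < N->TLSCalls; ++i) {
    if (BaseReg == 0)
      BaseReg = NextVReg++; // first occurrence: keep the base address in a new register
    // every occurrence (first or later) gets rewritten, so report a change
    Changed = true;
  }
  for (DomNodeModel *Child : N->Children)
    Changed |= visitTLS(Child, BaseReg, NextVReg); // children inherit BaseReg by value
  return Changed;
}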
+ MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + is64Bit ? X86::RAX : X86::EAX) + .addReg(TLSBaseAddrReg); + + // Erase the TLS_base_addr instruction. + I->eraseFromParent(); + + return Copy; + } + + // Create a virtal register in *TLSBaseAddrReg, and populate it by + // inserting a copy instruction after I. Returns the new instruction. + MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); + const bool is64Bit = STI.is64Bit(); + const X86InstrInfo *TII = STI.getInstrInfo(); + + // Create a virtual register for the TLS base address. + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit + ? &X86::GR64RegClass + : &X86::GR32RegClass); + + // Insert a copy from RAX/EAX to TLSBaseAddrReg. + MachineInstr *Next = I->getNextNode(); + MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + *TLSBaseAddrReg) + .addReg(is64Bit ? X86::RAX : X86::EAX); + + return Copy; + } + + const char *getPassName() const override { + return "Local Dynamic TLS Access Clean-up"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +char LDTLSCleanup::ID = 0; +FunctionPass* +llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h new file mode 100644 index 0000000..edd09d6 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -0,0 +1,571 @@ +//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H +#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H + +#include "MCTargetDesc/X86BaseInfo.h" +#include "X86RegisterInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "X86GenInstrInfo.inc" + +namespace llvm { + class X86RegisterInfo; + class X86Subtarget; + +namespace X86 { + // X86 specific condition code. These correspond to X86_*_COND in + // X86InstrInfo.td. They must be kept in synch. + enum CondCode { + COND_A = 0, + COND_AE = 1, + COND_B = 2, + COND_BE = 3, + COND_E = 4, + COND_G = 5, + COND_GE = 6, + COND_L = 7, + COND_LE = 8, + COND_NE = 9, + COND_NO = 10, + COND_NP = 11, + COND_NS = 12, + COND_O = 13, + COND_P = 14, + COND_S = 15, + LAST_VALID_COND = COND_S, + + // Artificial condition codes. These are used by AnalyzeBranch + // to indicate a block terminated with two conditional branches to + // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, + // which can't be represented on x86 with a single condition. These + // are never used in MachineInstrs. + COND_NE_OR_P, + COND_NP_OR_E, + + COND_INVALID + }; + + // Turn condition code into conditional branch opcode. 
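GetOppositeBranchCondition, declared just below, pairs each of these codes with its logical negation; the pairs follow from x86 semantics rather than anything specific to this file. A standalone sketch of the mapping, using an unnumbered stand-in enum rather than the numbered one above.

enum CondCodeModel {
  CC_A, CC_AE, CC_B, CC_BE, CC_E, CC_G, CC_GE, CC_L,
  CC_LE, CC_NE, CC_NO, CC_NP, CC_NS, CC_O, CC_P, CC_S,
  CC_INVALID
};

// Each code maps to its logical negation: A/BE, AE/B, E/NE, G/LE, GE/L,
// O/NO, P/NP, S/NS. Sketch only; the numbered enum above is authoritative.
CondCodeModel oppositeCondition(CondCodeModel CC) {
  switch (CC) {
  case CC_A:  return CC_BE;  case CC_BE: return CC_A;
  case CC_AE: return CC_B;   case CC_B:  return CC_AE;
  case CC_E:  return CC_NE;  case CC_NE: return CC_E;
  case CC_G:  return CC_LE;  case CC_LE: return CC_G;
  case CC_GE: return CC_L;   case CC_L:  return CC_GE;
  case CC_O:  return CC_NO;  case CC_NO: return CC_O;
  case CC_P:  return CC_NP;  case CC_NP: return CC_P;
  case CC_S:  return CC_NS;  case CC_NS: return CC_S;
  default:    return CC_INVALID;
  }
}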
+ unsigned GetCondBranchFromCond(CondCode CC); + + /// \brief Return a set opcode for the given condition and whether it has + /// a memory operand. + unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); + + /// \brief Return a cmov opcode for the given condition, register size in + /// bytes, and operand type. + unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand = false); + + // Turn CMov opcode into condition code. + CondCode getCondFromCMovOpc(unsigned Opc); + + /// GetOppositeBranchCondition - Return the inverse of the specified cond, + /// e.g. turning COND_E to COND_NE. + CondCode GetOppositeBranchCondition(CondCode CC); +} // end namespace X86; + + +/// isGlobalStubReference - Return true if the specified TargetFlag operand is +/// a reference to a stub for a global, not the global itself. +inline static bool isGlobalStubReference(unsigned char TargetFlag) { + switch (TargetFlag) { + case X86II::MO_DLLIMPORT: // dllimport stub. + case X86II::MO_GOTPCREL: // rip-relative GOT reference. + case X86II::MO_GOT: // normal GOT reference. + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref. + case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref. + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref. + return true; + default: + return false; + } +} + +/// isGlobalRelativeToPICBase - Return true if the specified global value +/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this +/// is true, the addressing mode has the PIC base register added in (e.g. EBX). +inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) { + switch (TargetFlag) { + case X86II::MO_GOTOFF: // isPICStyleGOT: local global. + case X86II::MO_GOT: // isPICStyleGOT: other global. + case X86II::MO_PIC_BASE_OFFSET: // Darwin local global. + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global. + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global. + case X86II::MO_TLVP: // ??? Pretty sure.. + return true; + default: + return false; + } +} + +inline static bool isScale(const MachineOperand &MO) { + return MO.isImm() && + (MO.getImm() == 1 || MO.getImm() == 2 || + MO.getImm() == 4 || MO.getImm() == 8); +} + +inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) { + if (MI->getOperand(Op).isFI()) return true; + return Op+X86::AddrSegmentReg <= MI->getNumOperands() && + MI->getOperand(Op+X86::AddrBaseReg).isReg() && + isScale(MI->getOperand(Op+X86::AddrScaleAmt)) && + MI->getOperand(Op+X86::AddrIndexReg).isReg() && + (MI->getOperand(Op+X86::AddrDisp).isImm() || + MI->getOperand(Op+X86::AddrDisp).isGlobal() || + MI->getOperand(Op+X86::AddrDisp).isCPI() || + MI->getOperand(Op+X86::AddrDisp).isJTI()); +} + +inline static bool isMem(const MachineInstr *MI, unsigned Op) { + if (MI->getOperand(Op).isFI()) return true; + return Op+X86::AddrNumOperands <= MI->getNumOperands() && + MI->getOperand(Op+X86::AddrSegmentReg).isReg() && + isLeaMem(MI, Op); +} + +class X86InstrInfo final : public X86GenInstrInfo { + X86Subtarget &Subtarget; + const X86RegisterInfo RI; + + /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, + /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps. 
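The isLeaMem and isMem helpers above walk the standard five-slot x86 address (base, scale, index, displacement, segment), and the folding tables declared next operate on instructions whose memory operands have exactly this shape. A standalone model of the layout and the scale restriction, with plain integers standing in for MachineOperands.

#include <cstdint>

// The five operand slots of an x86 memory reference, in the order the helpers
// above index them (AddrBaseReg, AddrScaleAmt, AddrIndexReg, AddrDisp,
// AddrSegmentReg). Register number 0 stands for "no register" in this model.
struct MemRefModel {
  unsigned BaseReg;
  int64_t  ScaleAmt;
  unsigned IndexReg;
  int64_t  Disp;      // the real displacement may instead be a global/CPI/jump-table ref
  unsigned SegmentReg;
};

// The same restriction isScale enforces: SIB encoding supports only these factors.
bool isEncodableScale(int64_t S) {
  return S == 1 || S == 2 || S == 4 || S == 8;
}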
+ /// + typedef DenseMap<unsigned, + std::pair<unsigned, unsigned> > RegOp2MemOpTableType; + RegOp2MemOpTableType RegOp2MemOpTable2Addr; + RegOp2MemOpTableType RegOp2MemOpTable0; + RegOp2MemOpTableType RegOp2MemOpTable1; + RegOp2MemOpTableType RegOp2MemOpTable2; + RegOp2MemOpTableType RegOp2MemOpTable3; + RegOp2MemOpTableType RegOp2MemOpTable4; + + /// MemOp2RegOpTable - Load / store unfolding opcode map. + /// + typedef DenseMap<unsigned, + std::pair<unsigned, unsigned> > MemOp2RegOpTableType; + MemOp2RegOpTableType MemOp2RegOpTable; + + static void AddTableEntry(RegOp2MemOpTableType &R2MTable, + MemOp2RegOpTableType &M2RTable, + unsigned RegOp, unsigned MemOp, unsigned Flags); + + virtual void anchor(); + + bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + SmallVectorImpl<MachineInstr *> &CondBranches, + bool AllowModify) const; + +public: + explicit X86InstrInfo(X86Subtarget &STI); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const X86RegisterInfo &getRegisterInfo() const { return RI; } + + /// getSPAdjust - This returns the stack pointer adjustment made by + /// this instruction. For x86, we need to handle more complex call + /// sequences involving PUSHes. + int getSPAdjust(const MachineInstr *MI) const override; + + /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" + /// extension instruction. That is, it's like a copy where it's legal for the + /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns + /// true, then it's expected the pre-extension value is available as a subreg + /// of the result register. This also returns the sub-register index in + /// SubIdx. + bool isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination + /// stack locations as well. This uses a heuristic so it isn't + /// reliable for correctness. + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const override; + + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination + /// stack locations as well. This uses a heuristic so it isn't + /// reliable for correctness. + unsigned isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const override; + + bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const override; + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const override; + + /// Given an operand within a MachineInstr, insert preceding code to put it + /// into the right format for a particular kind of LEA instruction. This may + /// involve using an appropriate super-register instead (with an implicit use + /// of the original) or creating a new virtual register and inserting COPY + /// instructions to get the data into the right class. + /// + /// Reference parameters are set to indicate how caller should add this + /// operand to the LEA instruction. 
+ bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, + unsigned LEAOpcode, bool AllowSP, + unsigned &NewSrc, bool &isKill, + bool &isUndef, MachineOperand &ImplicitOp) const; + + /// convertToThreeAddress - This method must be implemented by targets that + /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target + /// may be able to convert a two-address instruction into a true + /// three-address instruction on demand. This allows the X86 target (for + /// example) to convert ADD and SHL instructions into LEA instructions if they + /// would require register copies due to two-addressness. + /// + /// This method returns a null pointer if the transformation cannot be + /// performed, otherwise it returns the new instruction. + /// + MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const override; + + /// Returns true iff the routine could find two commutable operands in the + /// given machine instruction. + /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their + /// input values can be re-defined in this method only if the input values + /// are not pre-defined, which is designated by the special value + /// 'CommuteAnyOperandIndex' assigned to it. + /// If both of indices are pre-defined and refer to some operands, then the + /// method simply returns true if the corresponding operands are commutable + /// and returns false otherwise. + /// + /// For example, calling this method this way: + /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex; + /// findCommutedOpIndices(MI, Op1, Op2); + /// can be interpreted as a query asking to find an operand that would be + /// commutable with the operand#1. + bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; + + /// Returns true if the routine could find two commutable operands + /// in the given FMA instruction. Otherwise, returns false. + /// + /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments. + /// The output indices of the commuted operands are returned in these + /// arguments. Also, the input values of these arguments may be preset either + /// to indices of operands that must be commuted or be equal to a special + /// value 'CommuteAnyOperandIndex' which means that the corresponding + /// operand index is not set and this method is free to pick any of + /// available commutable operands. + /// + /// For example, calling this method this way: + /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex; + /// findFMA3CommutedOpIndices(MI, Idx1, Idx2); + /// can be interpreted as a query asking if the operand #1 can be swapped + /// with any other available operand (e.g. operand #2, operand #3, etc.). + /// + /// The returned FMA opcode may differ from the opcode in the given MI. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results into instruction with adjusted opcode: + /// FMA231 #3, #2, #1 + bool findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const; + + /// Returns an adjusted FMA opcode that must be used in FMA instruction that + /// performs the same computations as the given MI but which has the operands + /// \p SrcOpIdx1 and \p SrcOpIdx2 commuted. + /// It may return 0 if it is unsafe to commute the operands. + /// + /// The returned FMA opcode may differ from the opcode in the given \p MI. 
+ /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results into instruction with adjusted opcode: + /// FMA231 #3, #2, #1 + unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const; + + // Branch analysis. + bool isUnpredicatedTerminator(const MachineInstr* MI) const override; + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const override; + + bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const override; + bool AnalyzeBranchPredicate(MachineBasicBlock &MBB, + TargetInstrInfo::MachineBranchPredicate &MBP, + bool AllowModify = false) const override; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond, + unsigned, unsigned, int&, int&, int&) const override; + void insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const override; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const; + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + /// foldMemoryOperand - If this target supports it, fold a load or store of + /// the specified stack slot into the specified machine instruction for the + /// specified operand(s). If this is possible, the target should perform the + /// folding and return true, otherwise it should return false. If it folds + /// the instruction, it is likely that the MachineInstruction the iterator + /// references has been changed. + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex) const override; + + /// foldMemoryOperand - Same as the previous version except it allows folding + /// of any load and store from / to any address, not just from a specific + /// stack slot. 
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + MachineInstr *LoadMI) const override; + + /// unfoldMemoryOperand - Separate a single instruction which folded a load or + /// a store or a load and a store into two or more instruction. If this is + /// possible, returns true as well as the new instructions by reference. + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const override; + + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const override; + + /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new + /// instruction after load / store are unfolded from an instruction of the + /// specified opcode. It returns zero if the specified unfolding is not + /// possible. If LoadRegIndex is non-null, it is filled in with the operand + /// index of the operand which will hold the register holding the loaded + /// value. + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; + + /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler + /// to determine if two loads are loading from the same base address. It + /// should only return true if the base pointers are the same and the + /// only differences between the two addresses are the offset. It also returns + /// the offsets by reference. + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, + int64_t &Offset2) const override; + + /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should + /// be scheduled togther. On some targets if two loads are loading from + /// addresses in the same cache line, it's better if they are scheduled + /// together. This function takes two integers that represent the load offsets + /// from the common base address. It returns true if it decides it's desirable + /// to schedule the two loads together. "NumLoads" is the number of loads that + /// have already been scheduled after Load1. + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const override; + + bool shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const override; + + void getNoopForMachoTarget(MCInst &NopInst) const override; + + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + + /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine + /// instruction that defines the specified register class. + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + + /// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction tha + /// would clobber the EFLAGS condition register. Note the result may be + /// conservative. If it cannot definitely determine the safety after visiting + /// a few instructions in each direction it assumes it's not safe. + bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + /// True if MI has a condition code def, e.g. EFLAGS, that is + /// not marked dead. + bool hasLiveCondCodeDef(MachineInstr *MI) const; + + /// getGlobalBaseReg - Return a virtual register initialized with the + /// the global base register value. 
Output instructions required to + /// initialize the register in the function entry block, if necessary. + /// + unsigned getGlobalBaseReg(MachineFunction *MF) const; + + std::pair<uint16_t, uint16_t> + getExecutionDomain(const MachineInstr *MI) const override; + + void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override; + + unsigned + getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const override; + unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum, + const TargetRegisterInfo *TRI) const override; + void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const override; + + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Alignment, + bool AllowCommute) const; + + void + getUnconditionalBranch(MCInst &Branch, + const MCSymbolRefExpr *BranchTarget) const override; + + void getTrap(MCInst &MI) const override; + + unsigned getJumpInstrTableEntryBound() const override; + + bool isHighLatencyDef(int opc) const override; + + bool hasHighOperandLatency(const TargetSchedModel &SchedModel, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const override; + + bool useMachineCombiner() const override { + return true; + } + + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + + bool hasReassociableOperands(const MachineInstr &Inst, + const MachineBasicBlock *MBB) const override; + + void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const override; + + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2 if having two register operands, and the value it + /// compares against in CmpValue. Return true if the comparison instruction + /// can be analyzed. + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const override; + + /// optimizeCompareInstr - Check if there exists an earlier instruction that + /// operates on the same source operands and sets flags in the same way as + /// Compare; remove Compare if possible. + bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const override; + + /// optimizeLoadInstr - Try to remove the load by folding it to a register + /// operand at the use. We fold the load instructions if and only if the + /// def and use are in the same BB. We only look at one load and see + /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register + /// defined by the load we are trying to fold. DefMI returns the machine + /// instruction that defines FoldAsLoadDefReg, and the function returns + /// the machine instruction generated due to folding. 
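Among the overrides above, getExecutionDomain returns a packed (domain, validDomains) pair that is easy to misread. A small interpretation sketch follows; the bit assignment is inferred from the 0xe and 0x6 masks used in the implementation earlier, so treat it as an assumption rather than a documented contract.

#include <cstdint>
#include <utility>

// .first is the current domain (1 = PackedSingle, 2 = PackedDouble,
// 3 = PackedInt, 0 = not an SSE-domain instruction); .second is assumed to
// have bit d set when the instruction can be rewritten into domain d
// (0xe = all three domains, 0x6 = the two FP domains only).
bool canSwitchToDomain(std::pair<uint16_t, uint16_t> DomInfo, unsigned Domain) {
  return Domain >= 1 && Domain <= 3 && (DomInfo.second & (1u << Domain)) != 0;
}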
+ MachineInstr* optimizeLoadInstr(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &FoldAsLoadDefReg, + MachineInstr *&DefMI) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + +protected: + /// Commutes the operands in the given instruction by changing the operands + /// order and/or changing the instruction's opcode and/or the immediate value + /// operand. + /// + /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands + /// to be commuted. + /// + /// Do not call this method for a non-commutable instruction or + /// non-commutable operands. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI, + unsigned CommuteOpIdx1, + unsigned CommuteOpIdx2) const override; + +private: + MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, + MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + /// Handles memory folding for special case instructions, for instance those + /// requiring custom manipulation of the address. + MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const; + + /// isFrameOperand - Return true and the FrameIndex if the specified + /// operand and follow operands form a reference to the stack frame. + bool isFrameOperand(const MachineInstr *MI, unsigned int Op, + int &FrameIndex) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td new file mode 100644 index 0000000..ea8e562 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -0,0 +1,3085 @@ +//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 instruction set, defining the instructions, and +// properties of the instructions which are needed for code generation, machine +// code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 specific DAG Nodes. +// + +def SDTIntShiftDOp: SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; + +def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; + +def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + +def SDTX86Cmov : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + +// Unary and binary operator instructions that set EFLAGS as a side-effect. 
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, + [SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; +// RES1, RES2, FLAGS = op LHS, RHS +def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; +def SDTX86BrCond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86SetCC : SDTypeProfile<1, 2, + [SDTCisVT<0, i8>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; +def SDTX86SetCC_C : SDTypeProfile<1, 2, + [SDTCisInt<0>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; + +def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, + SDTCisVT<2, i8>]>; +def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>; +def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>; + +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; + +def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + +def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, i32>, + SDTCisVT<3, i8>, + SDTCisVT<4, i32>]>; + +def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; + +def SDTX86Void : SDTypeProfile<0, 0, []>; + +def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; + +def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + +def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; + +def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; + +def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, + [SDNPHasChain,SDNPSideEffect]>; +def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; + + +def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; +def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; +def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; + +def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>; +def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; + +def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; +def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, + [SDNPHasChain]>; +def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; +def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; + +def X86sahf : 
SDNode<"X86ISD::SAHF", SDTX86sahf>; + +def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; + +def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue]>; + +def X86vastart_save_xmm_regs : + SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", + SDT_X86VASTART_SAVE_XMM_REGS, + [SDNPHasChain, SDNPVariadic]>; +def X86vaarg64 : + SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, + SDNPMemOperand]>; +def X86callseq_start : + SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def X86callseq_end : + SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86call : SDNode<"X86ISD::CALL", SDT_X86Call, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, + SDNPVariadic]>; + +def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>; +def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad]>; + +def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; + +def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; +def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; + +def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisInt<1>]>>; + +def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, + [SDNPHasChain]>; + +def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP", + SDTypeProfile<1, 1, [SDTCisInt<0>, + SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPSideEffect]>; + +def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>; +def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags, + [SDNPCommutative]>; +def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>; +def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>; + +def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; +def 
X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; +def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, + [SDNPCommutative]>; + +def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; + +def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; + +def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; + +def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, + [SDNPHasChain]>; + +def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +//===----------------------------------------------------------------------===// +// X86 Operand Definitions. +// + +// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for +// the index operand of an address, to conform to x86 encoding restrictions. +def ptr_rc_nosp : PointerLikeRegClass<1>; + +// *mem - Operand definitions for the funky X86 addressing mode operands. +// +def X86MemAsmOperand : AsmOperandClass { + let Name = "Mem"; +} +let RenderMethod = "addMemOperands" in { + def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; } + def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; } + def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; } + def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; } + def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; } + def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; } + def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; } + def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; } + // Gather mem operands + def X86MemVX32Operand : AsmOperandClass { let Name = "MemVX32"; } + def X86MemVY32Operand : AsmOperandClass { let Name = "MemVY32"; } + def X86MemVZ32Operand : AsmOperandClass { let Name = "MemVZ32"; } + def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; } + def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; } + def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; } + def X86MemVX32XOperand : AsmOperandClass { let Name = "MemVX32X"; } + def X86MemVY32XOperand : AsmOperandClass { let Name = "MemVY32X"; } + def X86MemVX64XOperand : AsmOperandClass { let Name = "MemVX64X"; } + def X86MemVY64XOperand : AsmOperandClass { let Name = "MemVY64X"; } +} + +def X86AbsMemAsmOperand : AsmOperandClass { + let Name = "AbsMem"; + let SuperClasses = [X86MemAsmOperand]; +} + +class X86MemOperand<string printMethod, + AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> { + let PrintMethod = printMethod; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = parserMatchClass; + let OperandType = "OPERAND_MEMORY"; +} + +// Gather mem operands +class X86VMemOperand<RegisterClass RC, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, i8imm); +} + +def anymem : X86MemOperand<"printanymem">; + +def opaque32mem : X86MemOperand<"printopaquemem">; +def opaque48mem : X86MemOperand<"printopaquemem">; +def opaque80mem : X86MemOperand<"printopaquemem">; +def opaque512mem : X86MemOperand<"printopaquemem">; + +def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>; +def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>; +def i32mem 
: X86MemOperand<"printi32mem", X86Mem32AsmOperand>; +def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>; +def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>; +def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>; +def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>; +def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>; +def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>; +def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>; +def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>; +def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>; +def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>; + +def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>; + +// Gather mem operands +def vx32mem : X86VMemOperand<VR128, "printi32mem", X86MemVX32Operand>; +def vy32mem : X86VMemOperand<VR256, "printi32mem", X86MemVY32Operand>; +def vx64mem : X86VMemOperand<VR128, "printi64mem", X86MemVX64Operand>; +def vy64mem : X86VMemOperand<VR256, "printi64mem", X86MemVY64Operand>; + +def vx32xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX32XOperand>; +def vx64xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX64XOperand>; +def vy32xmem : X86VMemOperand<VR256X, "printi32mem", X86MemVY32XOperand>; +def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>; +def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>; +def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>; + +// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead +// of a plain GPR, so that it doesn't potentially require a REX prefix. +def ptr_rc_norex : PointerLikeRegClass<2>; +def ptr_rc_norex_nosp : PointerLikeRegClass<3>; + +def i8mem_NOREX : Operand<iPTR> { + let PrintMethod = "printi8mem"; + let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, i8imm); + let ParserMatchClass = X86Mem8AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// GPRs available for tailcall. +// It represents GR32_TC, GR64_TC or GR64_TCW64. +def ptr_rc_tailcall : PointerLikeRegClass<4>; + +// Special i32mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. +def i32mem_TC : Operand<i32> { + let PrintMethod = "printi32mem"; + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, + i32imm, i8imm); + let ParserMatchClass = X86Mem32AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// Special i64mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. +def i64mem_TC : Operand<i64> { + let PrintMethod = "printi64mem"; + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, + ptr_rc_tailcall, i32imm, i8imm); + let ParserMatchClass = X86Mem64AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +let OperandType = "OPERAND_PCREL", + ParserMatchClass = X86AbsMemAsmOperand, + PrintMethod = "printPCRelImm" in { +def i32imm_pcrel : Operand<i32>; +def i16imm_pcrel : Operand<i16>; + +// Branch targets have OtherVT type and print as pc-relative values. +def brtarget : Operand<OtherVT>; +def brtarget8 : Operand<OtherVT>; + +} + +// Special parser to detect 16-bit mode to select 16-bit displacement. 
+def X86AbsMem16AsmOperand : AsmOperandClass { + let Name = "AbsMem16"; + let RenderMethod = "addAbsMemOperands"; + let SuperClasses = [X86AbsMemAsmOperand]; +} + +// Branch targets have OtherVT type and print as pc-relative values. +let OperandType = "OPERAND_PCREL", + PrintMethod = "printPCRelImm" in { +let ParserMatchClass = X86AbsMem16AsmOperand in + def brtarget16 : Operand<OtherVT>; +let ParserMatchClass = X86AbsMemAsmOperand in + def brtarget32 : Operand<OtherVT>; +} + +let RenderMethod = "addSrcIdxOperands" in { + def X86SrcIdx8Operand : AsmOperandClass { + let Name = "SrcIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86SrcIdx16Operand : AsmOperandClass { + let Name = "SrcIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86SrcIdx32Operand : AsmOperandClass { + let Name = "SrcIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86SrcIdx64Operand : AsmOperandClass { + let Name = "SrcIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addSrcIdxOperands" + +let RenderMethod = "addDstIdxOperands" in { + def X86DstIdx8Operand : AsmOperandClass { + let Name = "DstIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86DstIdx16Operand : AsmOperandClass { + let Name = "DstIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86DstIdx32Operand : AsmOperandClass { + let Name = "DstIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86DstIdx64Operand : AsmOperandClass { + let Name = "DstIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addDstIdxOperands" + +let RenderMethod = "addMemOffsOperands" in { + def X86MemOffs16_8AsmOperand : AsmOperandClass { + let Name = "MemOffs16_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs16_16AsmOperand : AsmOperandClass { + let Name = "MemOffs16_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs16_32AsmOperand : AsmOperandClass { + let Name = "MemOffs16_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_8AsmOperand : AsmOperandClass { + let Name = "MemOffs32_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs32_16AsmOperand : AsmOperandClass { + let Name = "MemOffs32_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs32_32AsmOperand : AsmOperandClass { + let Name = "MemOffs32_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_64AsmOperand : AsmOperandClass { + let Name = "MemOffs32_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } + def X86MemOffs64_8AsmOperand : AsmOperandClass { + let Name = "MemOffs64_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs64_16AsmOperand : AsmOperandClass { + let Name = "MemOffs64_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs64_32AsmOperand : AsmOperandClass { + let Name = "MemOffs64_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs64_64AsmOperand : AsmOperandClass { + let Name = "MemOffs64_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addMemOffsOperands" + +class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, i8imm); +} + +class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc); +} + +def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>; +def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", 
X86SrcIdx16Operand>; +def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>; +def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>; +def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>; +def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>; +def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>; +def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>; + +class X86MemOffsOperand<Operand immOperand, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops immOperand, i8imm); +} + +def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8", + X86MemOffs16_8AsmOperand>; +def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16", + X86MemOffs16_16AsmOperand>; +def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32", + X86MemOffs16_32AsmOperand>; +def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8", + X86MemOffs32_8AsmOperand>; +def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16", + X86MemOffs32_16AsmOperand>; +def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32", + X86MemOffs32_32AsmOperand>; +def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64", + X86MemOffs32_64AsmOperand>; +def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8", + X86MemOffs64_8AsmOperand>; +def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16", + X86MemOffs64_16AsmOperand>; +def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32", + X86MemOffs64_32AsmOperand>; +def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64", + X86MemOffs64_64AsmOperand>; + +def SSECC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def i8immZExt3 : ImmLeaf<i8, [{ + return Imm >= 0 && Imm < 8; +}]>; + +def AVXCC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def i8immZExt5 : ImmLeaf<i8, [{ + return Imm >= 0 && Imm < 32; +}]>; + +def AVX512ICC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def XOPCC : Operand<i8> { + let PrintMethod = "printXOPCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +class ImmSExtAsmOperandClass : AsmOperandClass { + let SuperClasses = [ImmAsmOperand]; + let RenderMethod = "addImmOperands"; +} + +def X86GR32orGR64AsmOperand : AsmOperandClass { + let Name = "GR32orGR64"; +} + +def GR32orGR64 : RegisterOperand<GR32> { + let ParserMatchClass = X86GR32orGR64AsmOperand; +} +def AVX512RCOperand : AsmOperandClass { + let Name = "AVX512RC"; +} +def AVX512RC : Operand<i32> { + let PrintMethod = "printRoundingControl"; + let OperandType = "OPERAND_IMMEDIATE"; + let ParserMatchClass = AVX512RCOperand; +} + +// Sign-extended immediate classes. We don't need to define the full lattice +// here because there is no instruction with an ambiguity between ImmSExti64i32 +// and ImmSExti32i8. +// +// The strange ranges come from the fact that the assembler always works with +// 64-bit immediates, but for a 16-bit target value we want to accept both "-1" +// (which will be a -1ULL), and "0xFF" (-1 in 16-bits). 
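Before the per-class range listings that follow, a standalone restatement of the acceptance rule just described, using the ImmSExti16i8 class as the example. The checker below is a sketch of mine grounded in the ranges listed next, not LLVM's own matcher code.

#include <cstdint>
#include <cassert>

// Accept a 64-bit assembler immediate for a sign-extended-i8-in-i16 slot if
// it is either a sign-extended 8-bit value, or the 16-bit (or full 64-bit)
// two's-complement spelling of such a value.
static bool isImmSExti16i8(uint64_t V) {
  return V <= 0x7Full ||                          // [0, 0x7F]
         (V >= 0xFF80ull && V <= 0xFFFFull) ||    // 16-bit -128..-1
         V >= 0xFFFFFFFFFFFFFF80ull;              // 64-bit -128..-1
}

int main() {
  assert(isImmSExti16i8(0x7F));          // +127
  assert(isImmSExti16i8(0xFFFF));        // "-1" written as a 16-bit pattern
  assert(isImmSExti16i8((uint64_t)-1));  // "-1" parsed as -1ULL
  assert(!isImmSExti16i8(0x80));         // +128 does not fit in a signed byte
  return 0;
}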
+ +// [0, 0x7FFFFFFF] | +// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i32"; +} + +// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti16i8"; + let SuperClasses = [ImmSExti64i32AsmOperand]; +} + +// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti32i8"; +} + +// [0, 0x0000007F] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i8"; + let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand, + ImmSExti64i32AsmOperand]; +} + +// Unsigned immediate used by SSE/AVX instructions +// [0, 0xFF] +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmUnsignedi8AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi8"; + let RenderMethod = "addImmOperands"; +} + +// A couple of more descriptive operand definitions. +// 16-bits but only 8 bits are significant. +def i16i8imm : Operand<i16> { + let ParserMatchClass = ImmSExti16i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} +// 32-bits but only 8 bits are significant. +def i32i8imm : Operand<i32> { + let ParserMatchClass = ImmSExti32i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant. +def i64i32imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i32AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 8 bits are significant. +def i64i8imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// Unsigned 8-bit immediate used by SSE/AVX instructions. +def u8imm : Operand<i8> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 32-bit immediate but only 8-bits are significant and they are unsigned. +// Used by some SSE/AVX instructions that use intrinsics. +def i32u8imm : Operand<i32> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant, and those bits are treated as being +// pc relative. +def i64i32imm_pcrel : Operand<i64> { + let PrintMethod = "printPCRelImm"; + let ParserMatchClass = X86AbsMemAsmOperand; + let OperandType = "OPERAND_PCREL"; +} + +def lea64_32mem : Operand<i32> { + let PrintMethod = "printanymem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + +// Memory operands that use 64-bit pointers in both ILP32 and LP64. +def lea64mem : Operand<i64> { + let PrintMethod = "printanymem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + + +//===----------------------------------------------------------------------===// +// X86 Complex Pattern Definitions. +// + +// Define X86-specific addressing mode. +def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex], + []>; +// In 64-bit mode 32-bit LEAs can use RIP-relative addressing. 
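The lea32addr/lea64addr complex patterns above list plain arithmetic nodes (add, sub, X86mul_imm, shl, or, frameindex) as possible roots because an LEA encodes base + index*scale + disp without a memory access, so small shifts and multiplies can be folded into the addressing mode. A minimal standalone illustration of that folding rule, placed here before the RIP-relative lea64_32addr variant defined next; the helper name is mine, not LLVM's.

#include <cassert>

// An LEA scale factor can only be 1, 2, 4 or 8, so a left shift by 0..3
// (or a multiply by one of those powers of two) folds into the scale field.
static bool shiftFoldsIntoLEAScale(unsigned ShAmt, unsigned &Scale) {
  if (ShAmt > 3)
    return false;
  Scale = 1u << ShAmt;
  return true;
}

int main() {
  unsigned Scale = 0;
  assert(shiftFoldsIntoLEAScale(3, Scale) && Scale == 8); // (x << 3) -> scale 8
  assert(shiftFoldsIntoLEAScale(0, Scale) && Scale == 1); // no shift at all
  assert(!shiftFoldsIntoLEAScale(4, Scale));              // needs a real shift
  return 0;
}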
+def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr", + [add, sub, mul, X86mul_imm, shl, or, + frameindex, X86WrapperRIP], + []>; + +def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex, + X86WrapperRIP], []>; + +def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; + +//===----------------------------------------------------------------------===// +// X86 Instruction Predicate Definitions. +def HasCMov : Predicate<"Subtarget->hasCMov()">; +def NoCMov : Predicate<"!Subtarget->hasCMov()">; + +def HasMMX : Predicate<"Subtarget->hasMMX()">; +def Has3DNow : Predicate<"Subtarget->has3DNow()">; +def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; +def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; +def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; +def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; +def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">; +def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; +def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; +def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; +def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; +def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">; +def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; +def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; +def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; +def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; +def HasAVX : Predicate<"Subtarget->hasAVX()">; +def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; +def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; +def HasAVX512 : Predicate<"Subtarget->hasAVX512()">, + AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">; +def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; +def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; +def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; +def HasCDI : Predicate<"Subtarget->hasCDI()">, + AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">; +def HasPFI : Predicate<"Subtarget->hasPFI()">, + AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">; +def HasERI : Predicate<"Subtarget->hasERI()">, + AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">; +def HasDQI : Predicate<"Subtarget->hasDQI()">, + AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">; +def NoDQI : Predicate<"!Subtarget->hasDQI()">; +def HasBWI : Predicate<"Subtarget->hasBWI()">, + AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">; +def NoBWI : Predicate<"!Subtarget->hasBWI()">; +def HasVLX : Predicate<"Subtarget->hasVLX()">, + AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; +def NoVLX : Predicate<"!Subtarget->hasVLX()">; +def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; +def PKU : Predicate<"!Subtarget->hasPKU()">; + +def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; +def HasAES : Predicate<"Subtarget->hasAES()">; +def HasFXSR : 
Predicate<"Subtarget->hasFXSR()">; +def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; +def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; +def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; +def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; +def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; +def HasFMA : Predicate<"Subtarget->hasFMA()">; +def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; +def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def HasXOP : Predicate<"Subtarget->hasXOP()">; +def HasTBM : Predicate<"Subtarget->hasTBM()">; +def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; +def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; +def HasF16C : Predicate<"Subtarget->hasF16C()">; +def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; +def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; +def HasBMI : Predicate<"Subtarget->hasBMI()">; +def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; +def HasRTM : Predicate<"Subtarget->hasRTM()">; +def HasHLE : Predicate<"Subtarget->hasHLE()">; +def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; +def HasADX : Predicate<"Subtarget->hasADX()">; +def HasSHA : Predicate<"Subtarget->hasSHA()">; +def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; +def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; +def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; +def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; +def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; +def HasMPX : Predicate<"Subtarget->hasMPX()">; +def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; +def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, + AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; +def In64BitMode : Predicate<"Subtarget->is64Bit()">, + AssemblerPredicate<"Mode64Bit", "64-bit mode">; +def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; +def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">; +def In16BitMode : Predicate<"Subtarget->is16Bit()">, + AssemblerPredicate<"Mode16Bit", "16-bit mode">; +def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, + AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">; +def In32BitMode : Predicate<"Subtarget->is32Bit()">, + AssemblerPredicate<"Mode32Bit", "32-bit mode">; +def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; +def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; +def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; +def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; +def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; +def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; +def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; +def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" + "TM.getCodeModel() != CodeModel::Kernel">; +def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" + "TM.getCodeModel() == CodeModel::Kernel">; +def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; +def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">; +def OptForSize : Predicate<"OptForSize">; +def OptForSpeed : Predicate<"!OptForSize">; +def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; +def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; +def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; +def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; +def HasFastMem32 : 
Predicate<"!Subtarget->isUnalignedMem32Slow()">; + +//===----------------------------------------------------------------------===// +// X86 Instruction Format Definitions. +// + +include "X86InstrFormats.td" + +//===----------------------------------------------------------------------===// +// Pattern fragments. +// + +// X86 specific condition code. These correspond to CondCode in +// X86InstrInfo.h. They must be kept in synch. +def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE +def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC +def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C +def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA +def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z +def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE +def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL +def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE +def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG +def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ +def X86_COND_NO : PatLeaf<(i8 10)>; +def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO +def X86_COND_NS : PatLeaf<(i8 12)>; +def X86_COND_O : PatLeaf<(i8 13)>; +def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE +def X86_COND_S : PatLeaf<(i8 15)>; + +// Predicate used to help when pattern matching LZCNT/TZCNT. +def X86_COND_E_OR_NE : ImmLeaf<i8, [{ + return (Imm == X86::COND_E) || (Imm == X86::COND_NE); +}]>; + + +def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; +def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; +def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; + +// If we have multiple users of an immediate, it's much smaller to reuse +// the register, rather than encode the immediate in every instruction. +// This has the risk of increasing register pressure from stretched live +// ranges, however, the immediates should be trivial to rematerialize by +// the RA in the event of high register pressure. +// TODO : This is currently enabled for stores and binary ops. There are more +// cases for which this can be enabled, though this catches the bulk of the +// issues. +// TODO2 : This should really also be enabled under O2, but there's currently +// an issue with RA where we don't pull the constants into their users +// when we rematerialize them. I'll follow-up on enabling O2 after we fix that +// issue. +// TODO3 : This is currently limited to single basic blocks (DAG creation +// pulls block immediates to the top and merges them if necessary). +// Eventually, it would be nice to allow ConstantHoisting to merge constants +// globally for potentially added savings. +// +def imm8_su : PatLeaf<(i8 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm16_su : PatLeaf<(i16 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm32_su : PatLeaf<(i32 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + + +def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; + + +// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit +// unsigned field. +def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>; + +def i64immZExt32SExt8 : ImmLeaf<i64, [{ + return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm)); +}]>; + +// Helper fragments for loads. 
+// It's always safe to treat a anyext i16 load as a i32 load if the i16 is +// known to be 32-bit aligned or better. Ditto for i8 to i16. +def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + +def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + +def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 4 && !LD->isVolatile(); + return false; +}]>; + +def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; + +def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; +def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; +def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>; +def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>; +def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>; +def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>; + +def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>; +def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>; +def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>; +def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>; +def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>; +def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>; +def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>; +def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>; +def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>; +def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>; + +def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>; +def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>; +def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>; +def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>; +def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>; +def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>; +def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>; +def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; +def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; +def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; + + +// An 'and' node with a single use. 
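Before the single-use helper fragments below, a standalone restatement of the widening rule the loadi16/loadi32 fragments above encode: an any-extending load may be treated as a load of the wider type only when it is non-volatile and at least as aligned as that wider type, so the extra bytes are guaranteed to be accessible. The function is illustrative, not LLVM's.

#include <cassert>

// Mirrors the loadi16/loadi32 checks above: a non-extending load of the
// exact width always matches; an any-extending load may be widened only if
// it is aligned to the wider width and not volatile; sext/zext loads never.
enum ExtKind { NonExtLoad, AnyExtLoad, SignOrZeroExtLoad };

static bool canWidenLoad(ExtKind Ext, unsigned AlignBytes, bool Volatile,
                         unsigned WideBytes) {
  if (Ext == NonExtLoad)
    return true;
  if (Ext == AnyExtLoad)
    return AlignBytes >= WideBytes && !Volatile;
  return false;
}

int main() {
  assert(canWidenLoad(AnyExtLoad, /*Align=*/4, /*Volatile=*/false, /*Wide=*/4));
  assert(!canWidenLoad(AnyExtLoad, 2, false, 4)); // only 2-byte aligned
  assert(!canWidenLoad(AnyExtLoad, 4, /*Volatile=*/true, 4));
  return 0;
}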
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'srl' node with a single use. +def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'trunc' node with a single use. +def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ + return N->hasOneUse(); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction list. +// + +// Nop +let hasSideEffects = 0, SchedRW = [WriteZero] in { + def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>; + def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16; + def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32; +} + + +// Constructing a stack frame. +def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), + "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>; + +let SchedRW = [WriteALU] in { +let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in +def LEAVE : I<0xC9, RawFrm, + (outs), (ins), "leave", [], IIC_LEAVE>, + Requires<[Not64BitMode]>; + +let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in +def LEAVE64 : I<0xC9, RawFrm, + (outs), (ins), "leave", [], IIC_LEAVE>, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { +def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], + IIC_POP_REG16>, OpSize16; +def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], + IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; +def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], + IIC_POP_REG>, OpSize16; +def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [], + IIC_POP_MEM>, OpSize16; +def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], + IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; +def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [], + IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, SchedRW + +let mayStore = 1, SchedRW = [WriteStore] in { +def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[], + IIC_PUSH_REG>, OpSize16; +def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], + IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; +def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], + IIC_PUSH_REG>, OpSize16; +def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[], + IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; + +def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; +def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; + +def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), + "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[Not64BitMode]>; +def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), + "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[Not64BitMode]>; +} // mayStore, SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], + 
IIC_PUSH_MEM>, OpSize16; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], + IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, mayStore, SchedRW + +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW], Defs = [ESP] in { + let Uses = [ESP, EFLAGS] in + def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_flags_read_u32))]>, + Requires<[Not64BitMode]>; + + let Uses = [RSP, EFLAGS] in + def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins), + [(set GR64:$dst, (int_x86_flags_read_u64))]>, + Requires<[In64BitMode]>; +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW] in { + let Defs = [ESP, EFLAGS], Uses = [ESP] in + def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), + [(int_x86_flags_write_u32 GR32:$src)]>, + Requires<[Not64BitMode]>; + + let Defs = [RSP, EFLAGS], Uses = [RSP] in + def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), + [(int_x86_flags_write_u64 GR64:$src)]>, + Requires<[In64BitMode]>; +} + +let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, + SchedRW = [WriteLoad] in { +def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, + OpSize16; +def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, + OpSize32, Requires<[Not64BitMode]>; +} + +let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0, + SchedRW = [WriteStore] in { +def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, + OpSize16; +def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>, + OpSize32, Requires<[Not64BitMode]>; +} + +let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { +def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], + IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; +def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], + IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; +def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [], + IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>; +} // mayLoad, SchedRW +let mayStore = 1, SchedRW = [WriteStore] in { +def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], + IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; +def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], + IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; +} // mayStore, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], + IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>; +} // mayLoad, mayStore, SchedRW +} + +let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteStore] in { +def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), + "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[In64BitMode]>; +def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), + "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[In64BitMode]>; +} + +let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in +def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, + OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>; +let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in +def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, + OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>; + +let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, 
ESP], Uses = [ESP], + mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in { +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>, + OpSize32, Requires<[Not64BitMode]>; +def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>, + OpSize16, Requires<[Not64BitMode]>; +} +let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], + mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>, + OpSize16, Requires<[Not64BitMode]>; +} + +let Constraints = "$src = $dst", SchedRW = [WriteALU] in { +// GR32 = bswap GR32 +def BSWAP32r : I<0xC8, AddRegFrm, + (outs GR32:$dst), (ins GR32:$src), + "bswap{l}\t$dst", + [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB; + +def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), + "bswap{q}\t$dst", + [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB; +} // Constraints = "$src = $dst", SchedRW + +// Bit scan instructions. +let Defs = [EFLAGS] in { +def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))], + IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>; +def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>; +def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], + IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>; +def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>; +def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))], + IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>; +def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>; + +def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], + IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>; +def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>; +def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], + IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>; +def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>; +def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], + IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>; +def BSR64rm : RI<0xBD, MRMSrcMem, (outs 
GR64:$dst), (ins i64mem:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>; +} // Defs = [EFLAGS] + +let SchedRW = [WriteMicrocoded] in { +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in { +def MOVSB : I<0xA4, RawFrmDstSrc, (outs dstidx8:$dst), (ins srcidx8:$src), + "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; +def MOVSW : I<0xA5, RawFrmDstSrc, (outs dstidx16:$dst), (ins srcidx16:$src), + "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16; +def MOVSL : I<0xA5, RawFrmDstSrc, (outs dstidx32:$dst), (ins srcidx32:$src), + "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32; +def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs dstidx64:$dst), (ins srcidx64:$src), + "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; +} + +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in +def STOSB : I<0xAA, RawFrmDst, (outs dstidx8:$dst), (ins), + "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>; +let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in +def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins), + "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16; +let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in +def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins), + "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32; +let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in +def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins), + "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>; + +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in +def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst), + "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>; +let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in +def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst), + "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16; +let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in +def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst), + "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32; +let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in +def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst), + "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>; + +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in { +def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), + "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>; +def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), + "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16; +def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src), + "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32; +def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), + "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Move Instructions. 
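As a footnote to the string instructions above (MOVS/STOS/SCAS/CMPS) before the move-instruction section continues below: each one steps RSI/RDI forward or backward by the element size depending on EFLAGS.DF, which is why EFLAGS appears among their implicit uses. A small standalone emulation of that stepping; the names and structure are mine, not the backend's.

#include <cstdint>
#include <cstring>
#include <cassert>

// Emulates one "movs" element: copy Size bytes from *RSI to *RDI, then
// advance (DF == 0) or retreat (DF == 1) both pointers by Size, which is
// exactly the implicit RSI/RDI update the hardware performs.
static void movsStep(uint8_t *&RDI, const uint8_t *&RSI, unsigned Size,
                     bool DF) {
  std::memcpy(RDI, RSI, Size);
  if (DF) { RDI -= Size; RSI -= Size; }
  else    { RDI += Size; RSI += Size; }
}

int main() {
  uint8_t Src[4] = {1, 2, 3, 4}, Dst[4] = {0, 0, 0, 0};
  const uint8_t *RSI = Src;
  uint8_t *RDI = Dst;
  for (int i = 0; i < 4; ++i)
    movsStep(RDI, RSI, 1, /*DF=*/false);   // "rep movsb" with DF clear
  assert(std::memcmp(Src, Dst, 4) == 0);
  return 0;
}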
+// +let SchedRW = [WriteMove] in { +let hasSideEffects = 0 in { +def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; +def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; +def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { +def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, imm:$src)], IIC_MOV>; +def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16; +def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, imm:$src)], IIC_MOV>, OpSize32; +def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>; +} +let isReMaterializable = 1 in { +def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), + "movabs{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, imm:$src)], IIC_MOV>; +} + +// Longer forms that use a ModR/M byte. Needed for disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; +def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; +} +} // SchedRW + +let SchedRW = [WriteStore] in { +def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>; +def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; +def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; +def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; +} // SchedRW + +let hasSideEffects = 0 in { + +/// Memory offset versions of moves. The immediate is an address mode sized +/// offset from the segment base. 
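These accumulator forms carry no ModR/M byte at all: the encoded immediate is an absolute offset from the segment base, and the AdSize16/AdSize32/AdSize64 attributes on the definitions that follow fix how wide that offset is. A standalone sketch of the address computation (segment base of zero in the usual flat-memory case); the helper is illustrative, not LLVM code.

#include <cstdint>
#include <cstdio>

// moffs addressing: linear address = segment base + encoded offset, where
// the offset width (2, 4 or 8 bytes) is fixed by the address-size attribute
// of the chosen encoding (AdSize16/AdSize32/AdSize64).
static uint64_t moffsLinearAddress(uint64_t SegBase, uint64_t EncodedOffset,
                                   unsigned OffsetBytes) {
  uint64_t Mask = OffsetBytes >= 8 ? ~0ULL
                                   : (1ULL << (8 * OffsetBytes)) - 1;
  return SegBase + (EncodedOffset & Mask);
}

int main() {
  // "mov 0x1234, %al" with a 32-bit offset and a flat (zero-based) segment.
  std::printf("0x%llx\n",
              (unsigned long long)moffsLinearAddress(0, 0x1234, 4));
  return 0;
}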
+let SchedRW = [WriteALU] in { +let mayLoad = 1 in { +let Defs = [AL] in +def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src), + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, + AdSize32; +let Defs = [AX] in +def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, + OpSize16, AdSize32; +let Defs = [EAX] in +def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, + OpSize32, AdSize32; +let Defs = [RAX] in +def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src), + "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>, + AdSize32; + +let Defs = [AL] in +def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src), + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16; +let Defs = [AX] in +def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, + OpSize16, AdSize16; +let Defs = [EAX] in +def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, + AdSize16, OpSize32; +} +let mayStore = 1 in { +let Uses = [AL] in +def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs offset32_8:$dst), (ins), + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32; +let Uses = [AX] in +def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_16:$dst), (ins), + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, + OpSize16, AdSize32; +let Uses = [EAX] in +def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_32:$dst), (ins), + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, + OpSize32, AdSize32; +let Uses = [RAX] in +def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs offset32_64:$dst), (ins), + "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>, + AdSize32; + +let Uses = [AL] in +def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs offset16_8:$dst), (ins), + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16; +let Uses = [AX] in +def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_16:$dst), (ins), + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, + OpSize16, AdSize16; +let Uses = [EAX] in +def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_32:$dst), (ins), + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, + OpSize32, AdSize16; +} +} + +// These forms all have full 64-bit absolute addresses in their instructions +// and use the movabs mnemonic to indicate this specific form. 
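A small companion check to the comment above, before the AdSize64 definitions that follow: the accumulator moves just defined carry only a 32-bit offset, so an absolute address that does not fit in 32 bits has to use one of these full 64-bit movabs offset forms. Standalone sketch; the helper name is mine.

#include <cstdint>
#include <cassert>

// True if an absolute address can be reached by the 32-bit-offset
// accumulator-move encodings above; otherwise the 64-bit "movabs" offset
// forms defined below are the only option.
static bool fitsIn32BitMoffs(uint64_t AbsAddr) {
  return AbsAddr <= 0xFFFFFFFFULL;
}

int main() {
  assert(fitsIn32BitMoffs(0x00000000DEADBEEFULL));
  assert(!fitsIn32BitMoffs(0x0000123400000000ULL));
  return 0;
}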
+let mayLoad = 1 in { +let Defs = [AL] in +def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src), + "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64; +let Defs = [AX] in +def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src), + "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64; +let Defs = [EAX] in +def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src), + "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32, + AdSize64; +let Defs = [RAX] in +def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src), + "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64; +} + +let mayStore = 1 in { +let Uses = [AL] in +def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset64_8:$dst), (ins), + "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64; +let Uses = [AX] in +def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_16:$dst), (ins), + "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64; +let Uses = [EAX] in +def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_32:$dst), (ins), + "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32, + AdSize64; +let Uses = [RAX] in +def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs offset64_64:$dst), (ins), + "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64; +} +} // hasSideEffects = 0 + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteMove] in { +def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; +def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; +def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +} + +let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { +def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>; +def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16; +def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32; +def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>; +} + +let SchedRW = [WriteStore] in { +def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>; +def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16; +def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32; +def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>; +} // SchedRW + +// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so +// that they can be used for copying and storing h 
registers, which can't be +// encoded when a REX prefix is present. +let isCodeGenOnly = 1 in { +let hasSideEffects = 0 in +def MOV8rr_NOREX : I<0x88, MRMDestReg, + (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>, + Sched<[WriteMove]>; +let mayStore = 1, hasSideEffects = 0 in +def MOV8mr_NOREX : I<0x88, MRMDestMem, + (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], + IIC_MOV_MEM>, Sched<[WriteStore]>; +let mayLoad = 1, hasSideEffects = 0, + canFoldAsLoad = 1, isReMaterializable = 1 in +def MOV8rm_NOREX : I<0x8A, MRMSrcMem, + (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], + IIC_MOV_MEM>, Sched<[WriteLoad]>; +} + + +// Condition code ops, incl. set if equal/not equal/... +let SchedRW = [WriteALU] in { +let Defs = [EFLAGS], Uses = [AH] in +def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasLAHFSAHF]>; +let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in +def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], + IIC_AHF>, // AH = flags + Requires<[HasLAHFSAHF]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Bit tests instructions: BT, BTS, BTR, BTC. + +let Defs = [EFLAGS] in { +let SchedRW = [WriteALU] in { +def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>, + OpSize16, TB; +def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, + OpSize32, TB; +def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB; +} // SchedRW + +// Unlike with the register+register form, the memory+register form of the +// bt instruction does not ignore the high bits of the index. From ISel's +// perspective, this is pretty bizarre. Make these instructions disassembly +// only for now. 
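A standalone sketch of the difference the comment above describes, ahead of the memory forms defined next: with a register destination, bt masks the bit index to the operand width, but with a memory destination the index first selects a byte relative to the operand address, so high index bits change which byte is read. Names are mine; this is an illustration of the architectural semantics, not the selector's code.

#include <cstdint>
#include <cassert>

// Register form: "bt %reg2, %reg1" only looks at Index mod OperandBits.
static bool btRegister(uint64_t Value, unsigned OperandBits, uint64_t Index) {
  return (Value >> (Index % OperandBits)) & 1;
}

// Memory form: "bt %reg, (mem)" treats memory as a bit string, so the index
// first selects the byte at Mem + (Index / 8) and then a bit within it.
static bool btMemory(const uint8_t *Mem, int64_t Index) {
  return (Mem[Index >> 3] >> (Index & 7)) & 1;
}

int main() {
  uint8_t Bytes[16] = {};
  Bytes[8] = 0x01;                     // bit 64 of the bit string is set
  uint64_t First8 = 0;                 // a 64-bit value holding Bytes[0..7]
  assert(!btRegister(First8, 64, 64)); // 64 % 64 == 0 -> tests bit 0
  assert(btMemory(Bytes, 64));         // reaches into the following byte
  return 0;
}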
+ +let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { + def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi16 addr:$src1), GR16:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, OpSize16, TB, Requires<[FastBTMem]>; + def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi32 addr:$src1), GR32:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, OpSize32, TB, Requires<[FastBTMem]>; + def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi64 addr:$src1), GR64:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, TB; +} + +let SchedRW = [WriteALU] in { +def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))], + IIC_BT_RI>, OpSize16, TB; +def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))], + IIC_BT_RI>, OpSize32, TB; +def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))], + IIC_BT_RI>, TB; +} // SchedRW + +// Note that these instructions don't need FastBTMem because that +// only applies when the other operand is in a register. When it's +// an immediate, bt is still fast. +let SchedRW = [WriteALU] in { +def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2)) + ], IIC_BT_MI>, OpSize16, TB; +def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2)) + ], IIC_BT_MI>, OpSize32, TB; +def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi64 addr:$src1), + i64immSExt8:$src2))], IIC_BT_MI>, TB; +} // SchedRW + +let hasSideEffects = 0 in { +let SchedRW = [WriteALU] in { +def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize16, TB; +def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize32, TB; +def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize16, TB; +def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize32, TB; +def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + +let SchedRW = [WriteALU] in { +def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize16, TB; +def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, 
$src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize32, TB; +def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize16, TB; +def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize32, TB; +def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} + +let SchedRW = [WriteALU] in { +def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize16, TB; +def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize32, TB; +def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize16, TB; +def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize32, TB; +def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + +let SchedRW = [WriteALU] in { +def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize16, TB; +def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize32, TB; +def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize16, TB; +def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize32, TB; +def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} + +let SchedRW = [WriteALU] in { +def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize16, TB; +def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize32, TB; +def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize16, TB; +def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize32, TB; +def 
BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + +let SchedRW = [WriteALU] in { +def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize16, TB; +def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize32, TB; +def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize16, TB; +def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize32, TB; +def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} +} // hasSideEffects = 0 +} // Defs = [EFLAGS] + + +//===----------------------------------------------------------------------===// +// Atomic support +// + +// Atomic swap. These are just normal xchg instructions. But since a memory +// operand is referenced, the atomicity is ensured. +multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag, + InstrItinClass itin> { + let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set + GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], + itin>; + def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], + itin>, OpSize16; + def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], + itin>, OpSize32; + def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], + itin>; + } +} + +defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>; + +// Swap between registers. +let SchedRW = [WriteALU] in { +let Constraints = "$val = $dst" in { +def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src), + "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; +def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src), + "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, + OpSize16; +def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src), + "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, + OpSize32; +def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), + "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; +} + +// Swap between EAX and other registers. 
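+// For example (illustrative): "xchgl %ebx, %eax" can use the one-byte
+// 0x90+rd encoding (0x93), while "xchgl %ebx, %ecx" needs the two-byte
+// 0x87 /r form. Note also that the memory forms above are implicitly locked:
+// "xchgl %ebx, (%rdi)" is atomic even without an explicit "lock" prefix.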
+let Uses = [AX], Defs = [AX] in +def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), + "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16; +let Uses = [EAX], Defs = [EAX] in +def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, + OpSize32, Requires<[Not64BitMode]>; +let Uses = [EAX], Defs = [EAX] in +// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding. +// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP. +def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, + OpSize32, Requires<[In64BitMode]>; +let Uses = [RAX], Defs = [RAX] in +def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), + "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>; +} // SchedRW + +let SchedRW = [WriteALU] in { +def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; +def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB, + OpSize16; +def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB, + OpSize32; +def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; +def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB, + OpSize16; +def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB, + OpSize32; +def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; + +} + +let SchedRW = [WriteALU] in { +def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG8>, TB; +def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB, OpSize16; +def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB, OpSize32; +def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB; +} // SchedRW + +let SchedRW = [WriteALULd, WriteRMW] in { +let mayLoad = 1, mayStore = 1 in { +def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM8>, TB; +def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB, OpSize16; +def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB, OpSize32; +def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB; +} + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in +def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), + 
"cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB; + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in +def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), + "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>, + TB, Requires<[HasCmpxchg16b]>; +} // SchedRW + + +// Lock instruction prefix +def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; + +// Rex64 instruction prefix +def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>, + Requires<[In64BitMode]>; + +// Data16 instruction prefix +def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>; + +// Repeat string operation instruction prefixes +// These uses the DF flag in the EFLAGS register to inc or dec ECX +let Defs = [ECX], Uses = [ECX,EFLAGS] in { +// Repeat (used with INS, OUTS, MOVS, LODS and STOS) +def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; +// Repeat while not equal (used with CMPS and SCAS) +def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; +} + + +// String manipulation instructions +let SchedRW = [WriteMicrocoded] in { +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in +def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src), + "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>; +let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in +def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src), + "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16; +let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in +def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src), + "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32; +let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in +def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src), + "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>; +} + +let SchedRW = [WriteSystem] in { +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in { +def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src), + "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>; +def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src), + "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16; +def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src), + "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32; +} + +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in { +def INSB : I<0x6C, RawFrmDst, (outs dstidx8:$dst), (ins), + "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>; +def INSW : I<0x6D, RawFrmDst, (outs dstidx16:$dst), (ins), + "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize16; +def INSL : I<0x6D, RawFrmDst, (outs dstidx32:$dst), (ins), + "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32; +} +} + +// Flag instructions +let SchedRW = [WriteALU] in { +def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>; +def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>; +def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>; +def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>; +def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>; +def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>; +def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>; + +def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; +} + +// Table lookup instructions +let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in +def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, + Sched<[WriteLoad]>; + +let SchedRW = 
[WriteMicrocoded] in { +// ASCII Adjust After Addition +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>, + Requires<[Not64BitMode]>; + +// ASCII Adjust AX Before Division +let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), + "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>; + +// ASCII Adjust AX After Multiply +let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src), + "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>; + +// ASCII Adjust AL After Subtraction - sets +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>, + Requires<[Not64BitMode]>; + +// Decimal Adjust AL after Addition +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in +def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>, + Requires<[Not64BitMode]>; + +// Decimal Adjust AL after Subtraction +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in +def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>, + Requires<[Not64BitMode]>; +} // SchedRW + +let SchedRW = [WriteSystem] in { +// Check Array Index Against Bounds +def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16, + Requires<[Not64BitMode]>; +def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32, + Requires<[Not64BitMode]>; + +// Adjust RPL Field of Segment Selector +def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>, + Requires<[Not64BitMode]>; +def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>, + Requires<[Not64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// MOVBE Instructions +// +let Predicates = [HasMOVBE] in { + let SchedRW = [WriteALULd] in { + def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>, + OpSize16, T8PS; + def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>, + OpSize32, T8PS; + def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>, + T8PS; + } + let SchedRW = [WriteStore] in { + def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>, + OpSize16, T8PS; + def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>, + OpSize32, T8PS; + def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>, + T8PS; + } +} + +//===----------------------------------------------------------------------===// +// RDRAND Instruction +// +let Predicates = [HasRDRAND], Defs = [EFLAGS] in { + def RDRAND16r : I<0xC7, 
MRM6r, (outs GR16:$dst), (ins), + "rdrand{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize16, TB; + def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), + "rdrand{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdrand))]>, OpSize32, TB; + def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), + "rdrand{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB; +} + +//===----------------------------------------------------------------------===// +// RDSEED Instruction +// +let Predicates = [HasRDSEED], Defs = [EFLAGS] in { + def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), + "rdseed{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB; + def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), + "rdseed{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB; + def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), + "rdseed{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB; +} + +//===----------------------------------------------------------------------===// +// LZCNT Instruction +// +let Predicates = [HasLZCNT], Defs = [EFLAGS] in { + def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS, + OpSize16; + def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize16; + + def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS, + OpSize32; + def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize32; + + def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>, + XS; + def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + +let Predicates = [HasLZCNT] in { + def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), + (LZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E_OR_NE), + (X86cmp GR32:$src, (i32 0))), + (LZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E_OR_NE), + (X86cmp GR64:$src, (i64 0))), + (LZCNT64rr GR64:$src)>; + def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), + (LZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E_OR_NE), + (X86cmp GR32:$src, (i32 0))), + (LZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E_OR_NE), + (X86cmp GR64:$src, (i64 0))), + (LZCNT64rr GR64:$src)>; + + def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), + (LZCNT16rm addr:$src)>; + def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), + (LZCNT32rm addr:$src)>; + def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), + (LZCNT64rm addr:$src)>; + def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), 
(X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), + (LZCNT16rm addr:$src)>; + def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), + (LZCNT32rm addr:$src)>; + def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), + (LZCNT64rm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// BMI Instructions +// +let Predicates = [HasBMI], Defs = [EFLAGS] in { + def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS, + OpSize16; + def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize16; + + def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS, + OpSize32; + def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize32; + + def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>, + XS; + def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + +multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, + RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), + []>, T8PS, VEX_4V; + let mayLoad = 1 in + def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), + []>, T8PS, VEX_4V; +} +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>; + defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W; + defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>; + defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W; + defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>; + defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W; +} + +//===----------------------------------------------------------------------===// +// Pattern fragments to auto generate BMI instructions. 
+//===----------------------------------------------------------------------===// + +let Predicates = [HasBMI] in { + // FIXME: patterns for the load versions are not implemented + def : Pat<(and GR32:$src, (add GR32:$src, -1)), + (BLSR32rr GR32:$src)>; + def : Pat<(and GR64:$src, (add GR64:$src, -1)), + (BLSR64rr GR64:$src)>; + + def : Pat<(xor GR32:$src, (add GR32:$src, -1)), + (BLSMSK32rr GR32:$src)>; + def : Pat<(xor GR64:$src, (add GR64:$src, -1)), + (BLSMSK64rr GR64:$src)>; + + def : Pat<(and GR32:$src, (ineg GR32:$src)), + (BLSI32rr GR32:$src)>; + def : Pat<(and GR64:$src, (ineg GR64:$src)), + (BLSI64rr GR64:$src)>; +} + +let Predicates = [HasBMI] in { + def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), + (TZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E_OR_NE), + (X86cmp GR32:$src, (i32 0))), + (TZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E_OR_NE), + (X86cmp GR64:$src, (i64 0))), + (TZCNT64rr GR64:$src)>; + def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), + (TZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E_OR_NE), + (X86cmp GR32:$src, (i32 0))), + (TZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E_OR_NE), + (X86cmp GR64:$src, (i64 0))), + (TZCNT64rr GR64:$src)>; + + def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), + (TZCNT16rm addr:$src)>; + def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), + (TZCNT32rm addr:$src)>; + def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), + (TZCNT64rm addr:$src)>; + def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), + (TZCNT16rm addr:$src)>; + def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), + (TZCNT32rm addr:$src)>; + def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), + (TZCNT64rm addr:$src)>; +} + + +multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>, + T8PS, VEX_4VOp3; + def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)), + (implicit EFLAGS)]>, T8PS, VEX_4VOp3; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem, + int_x86_bmi_bextr_32, loadi32>; + defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem, + int_x86_bmi_bextr_64, loadi64>, VEX_W; +} + +let Predicates = [HasBMI2], Defs = [EFLAGS] in { + defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem, + int_x86_bmi_bzhi_32, loadi32>; + defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem, + int_x86_bmi_bzhi_64, loadi64>, VEX_W; +} + + +def CountTrailingOnes : SDNodeXForm<imm, [{ + // Count the trailing ones in the immediate. 
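+  // For example (illustrative): a mask like 0xFFFFFFFFFF has 40 trailing
+  // ones, so this transform produces the i8 immediate 40; BZHI with that
+  // count keeps bits [39:0] and clears the rest.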
+ return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N)); +}]>; + +def BZHIMask : ImmLeaf<i64, [{ + return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32); +}]>; + +let Predicates = [HasBMI2] in { + def : Pat<(and GR64:$src, BZHIMask:$mask), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; + + def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)), + (BZHI32rr GR32:$src, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)), + (BZHI32rm addr:$src, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; +} // HasBMI2 + +let Predicates = [HasBMI] in { + def : Pat<(X86bextr GR32:$src1, GR32:$src2), + (BEXTR32rr GR32:$src1, GR32:$src2)>; + def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2), + (BEXTR32rm addr:$src1, GR32:$src2)>; + def : Pat<(X86bextr GR64:$src1, GR64:$src2), + (BEXTR64rr GR64:$src1, GR64:$src2)>; + def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2), + (BEXTR64rm addr:$src1, GR64:$src2)>; +} // HasBMI + +multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2))]>, + VEX_4V; + def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V; +} + +let Predicates = [HasBMI2] in { + defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, + int_x86_bmi_pdep_32, loadi32>, T8XD; + defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, + int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W; + defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, + int_x86_bmi_pext_32, loadi32>, T8XS; + defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, + int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W; +} + +//===----------------------------------------------------------------------===// +// TBM Instructions +// +let Predicates = [HasTBM], Defs = [EFLAGS] in { + +multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + Intrinsic Int, Operand immtype, + SDPatternOperator immoperator> { + def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>, + XOP, XOPA; + def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>, + XOP, XOPA; +} + +defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32, + int_x86_tbm_bextri_u32, i32imm, imm>; +let ImmT = Imm32S in +defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64, + int_x86_tbm_bextri_u64, i64i32imm, + i64immSExt32>, VEX_W; + +multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem, + RegisterClass RC, string 
OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag> { +let hasSideEffects = 0 in { + def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), + []>, XOP_4V, XOP9; + let mayLoad = 1 in + def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), + []>, XOP_4V, XOP9; +} +} + +multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr, + Format FormReg, Format FormMem> { + defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem, + loadi32>; + defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem, + loadi64>, VEX_W; +} + +defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>; +defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>; +defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>; +defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>; +defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>; +defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>; +defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>; +defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>; +defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>; +} // HasTBM, EFLAGS + +//===----------------------------------------------------------------------===// +// MONITORX/MWAITX Instructions +// +let SchedRW = [WriteSystem] in { +let Uses = [EAX, ECX, EDX] in +def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], + IIC_SSE_MONITOR>, TB; +let Uses = [ECX, EAX, EBX] in +def MWAITXrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [], IIC_SSE_MWAIT>, + TB; +} // SchedRW + +def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrr)>, Requires<[Not64BitMode]>; +def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrr)>, Requires<[In64BitMode]>; + +def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, + Requires<[Not64BitMode]>; +def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, + Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// CLZERO Instruction +// +let Uses = [EAX] in +def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB; + +//===----------------------------------------------------------------------===// +// Pattern fragments to auto generate TBM instructions. +//===----------------------------------------------------------------------===// + +let Predicates = [HasTBM] in { + def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)), + (BEXTRI32ri GR32:$src1, imm:$src2)>; + def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)), + (BEXTRI32mi addr:$src1, imm:$src2)>; + def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2), + (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>; + def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2), + (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>; + + // FIXME: patterns for the load versions are not implemented + def : Pat<(and GR32:$src, (add GR32:$src, 1)), + (BLCFILL32rr GR32:$src)>; + def : Pat<(and GR64:$src, (add GR64:$src, 1)), + (BLCFILL64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (not (add GR32:$src, 1))), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (not (add GR64:$src, 1))), + (BLCI64rr GR64:$src)>; + + // Extra patterns because opt can optimize the above patterns to this. 
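+  // (Illustrative derivation: not(x + 1) == -(x + 1) - 1 == -2 - x, so the
+  // "(sub -2, x)" shape below is just the simplified form of the pattern
+  // above.)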
+ def : Pat<(or GR32:$src, (sub -2, GR32:$src)), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (sub -2, GR64:$src)), + (BLCI64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, 1)), + (BLCIC32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, 1)), + (BLCIC64rr GR64:$src)>; + + def : Pat<(xor GR32:$src, (add GR32:$src, 1)), + (BLCMSK32rr GR32:$src)>; + def : Pat<(xor GR64:$src, (add GR64:$src, 1)), + (BLCMSK64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, 1)), + (BLCS32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, 1)), + (BLCS64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, -1)), + (BLSFILL32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, -1)), + (BLSFILL64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, -1)), + (BLSIC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, -1)), + (BLSIC64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, 1)), + (T1MSKC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, 1)), + (T1MSKC64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, -1)), + (TZMSK32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, -1)), + (TZMSK64rr GR64:$src)>; +} // HasTBM + +//===----------------------------------------------------------------------===// +// Memory Instructions +// + +def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflushopt\t$src", []>, PD; +def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD; +def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD; + + +//===----------------------------------------------------------------------===// +// Subsystems. +//===----------------------------------------------------------------------===// + +include "X86InstrArithmetic.td" +include "X86InstrCMovSetCC.td" +include "X86InstrExtension.td" +include "X86InstrControl.td" +include "X86InstrShiftRotate.td" + +// X87 Floating Point Stack. +include "X86InstrFPStack.td" + +// SIMD support (SSE, MMX and AVX) +include "X86InstrFragmentsSIMD.td" + +// FMA - Fused Multiply-Add support (requires FMA) +include "X86InstrFMA.td" + +// XOP +include "X86InstrXOP.td" + +// SSE, MMX and 3DNow! vector support. +include "X86InstrSSE.td" +include "X86InstrAVX512.td" +include "X86InstrMMX.td" +include "X86Instr3DNow.td" + +// MPX instructions +include "X86InstrMPX.td" + +include "X86InstrVMX.td" +include "X86InstrSVM.td" + +include "X86InstrTSX.td" +include "X86InstrSGX.td" + +// System instructions. +include "X86InstrSystem.td" + +// Compiler Pseudo Instructions and Pat Patterns +include "X86InstrCompiler.td" + +//===----------------------------------------------------------------------===// +// Assembler Mnemonic Aliases +//===----------------------------------------------------------------------===// + +def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"cbw", "cbtw", "att">; +def : MnemonicAlias<"cwde", "cwtl", "att">; +def : MnemonicAlias<"cwd", "cwtd", "att">; +def : MnemonicAlias<"cdq", "cltd", "att">; +def : MnemonicAlias<"cdqe", "cltq", "att">; +def : MnemonicAlias<"cqo", "cqto", "att">; + +// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq. 
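+// For instance (illustrative): a bare "lret" in 64-bit code far-returns by
+// popping a 32-bit offset and selector; the 64-bit pop must be written
+// "lretq" explicitly.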
+def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; + +def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; + +def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popfd", "popfl", "att">; + +// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in +// all modes. However: "push (addr)" and "push $42" should default to +// pushl/pushq depending on the current mode. Similar for "pop %bx" +def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushfd", "pushfl", "att">; + +def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>; + +def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>; + +def : MnemonicAlias<"repe", "rep">; +def : MnemonicAlias<"repz", "rep">; +def : MnemonicAlias<"repnz", "repne">; + +def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"sal", "shl", "intel">; +def : MnemonicAlias<"salb", "shlb", "att">; +def : MnemonicAlias<"salw", "shlw", "att">; +def : MnemonicAlias<"sall", "shll", "att">; +def : MnemonicAlias<"salq", "shlq", "att">; + +def : MnemonicAlias<"smovb", "movsb", "att">; +def : MnemonicAlias<"smovw", "movsw", "att">; +def : MnemonicAlias<"smovl", "movsl", "att">; +def : MnemonicAlias<"smovq", "movsq", "att">; + +def : MnemonicAlias<"ud2a", "ud2", "att">; +def : MnemonicAlias<"verrw", "verr", "att">; + +// System instruction aliases. 
+def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>; +def : MnemonicAlias<"sysret", "sysretl", "att">; +def : MnemonicAlias<"sysexit", "sysexitl", "att">; + +def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>; + + +// Floating point stack aliases. +def : MnemonicAlias<"fcmovz", "fcmove", "att">; +def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; +def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; +def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; +def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; +def : MnemonicAlias<"fildq", "fildll", "att">; +def : MnemonicAlias<"fistpq", "fistpll", "att">; +def : MnemonicAlias<"fisttpq", "fisttpll", "att">; +def : MnemonicAlias<"fldcww", "fldcw", "att">; +def : MnemonicAlias<"fnstcww", "fnstcw", "att">; +def : MnemonicAlias<"fnstsww", "fnstsw", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; +def : MnemonicAlias<"fwait", "wait">; + +def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; +def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; +def : MnemonicAlias<"xsaveq", "xsave64", "att">; +def : MnemonicAlias<"xrstorq", "xrstor64", "att">; +def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; +def : MnemonicAlias<"xrstorsq", "xrstors64", "att">; +def : MnemonicAlias<"xsavecq", "xsavec64", "att">; +def : MnemonicAlias<"xsavesq", "xsaves64", "att">; + +class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, + string VariantName> + : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix), + !strconcat(Prefix, NewCond, Suffix), VariantName>; + +/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of +/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for +/// example "setz" -> "sete". 
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix, + string V = ""> { + def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb + def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete + def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe + def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae + def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae + def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle + def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge + def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne + def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp + def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp + + def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb + def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta + def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl + def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg +} + +// Aliases for set<CC> +defm : IntegerCondCodeMnemonicAlias<"set", "">; +// Aliases for j<CC> +defm : IntegerCondCodeMnemonicAlias<"j", "">; +// Aliases for cmov<CC>{w,l,q} +defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">; +// No size suffix for intel-style asm. +defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; + + +//===----------------------------------------------------------------------===// +// Assembler Instruction Aliases +//===----------------------------------------------------------------------===// + +// aad/aam default to base 10 if no operand is specified. +def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>; +def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; + +// Disambiguate the mem/imm form of bt-without-a-suffix as btl. +// Likewise for btc/btr/bts. +def : InstAlias<"bt {$imm, $mem|$mem, $imm}", + (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"btc {$imm, $mem|$mem, $imm}", + (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"btr {$imm, $mem|$mem, $imm}", + (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"bts {$imm, $mem|$mem, $imm}", + (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>; + +// clr aliases. +def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; + +// lods aliases. Accept the destination being omitted because it's implicit +// in the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"lodsb $src", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lodsw $src", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods{l|d} $src", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lodsq $src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods {$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lods {$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods {$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lods {$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; + +// stos aliases. 
Accept the source being omitted because it's implicit in +// the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the source. +def : InstAlias<"stosb $dst", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stosw $dst", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos{l|d} $dst", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stosq $dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos {%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stos {%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos {%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stos {%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; + +// scas aliases. Accept the destination being omitted because it's implicit +// in the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"scasb $dst", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scasw $dst", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas{l|d} $dst", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scasq $dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas {$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scas {$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas {$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scas {$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; + +// div and idiv aliases for explicit A register. +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>; +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; + + + +// Various unary fpstack operations default to operating on on ST1. 
+// For example, "fxch" -> "fxch %st(1)" +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; +def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; +def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; +def : InstAlias<"fxch", (XCH_F ST1), 0>; +def : InstAlias<"fcom", (COM_FST0r ST1), 0>; +def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; +def : InstAlias<"fcomi", (COM_FIr ST1), 0>; +def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; +def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; +def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>; +def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; + +// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. +// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate +// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with +// gas. +multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"), + (Inst ST0), EmitAlias>; +} + +defm : FpUnaryAlias<"fadd", ADD_FST0r>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; +defm : FpUnaryAlias<"fsub", SUB_FST0r>; +defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; +defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; +defm : FpUnaryAlias<"fmul", MUL_FST0r>; +defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; +defm : FpUnaryAlias<"fdiv", DIV_FST0r>; +defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; +defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; +defm : FpUnaryAlias<"fcompi", COM_FIPr>; +defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; + + +// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they +// commute. We also allow fdiv[r]p/fsubrp even though they don't commute, +// solely because gas supports it. +def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; + +// We accept "fnstsw %eax" even though it only writes %ax. +def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>; +def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>; +def : InstAlias<"fnstsw" , (FNSTSW16r)>; + +// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but +// this is compatible with what GAS does. 
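+// For instance (illustrative): even in 64-bit code a plain "ljmp *(%rax)" is
+// matched as FARJMP32m below, loading a 16-bit selector plus 32-bit offset
+// (opaque48mem) rather than a 64-bit far pointer.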
+def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall {*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp {*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall {*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp {*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; + +def : InstAlias<"call {*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call {*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"call {*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; + + +// "imul <imm>, B" is an alias for "imul <imm>, B, B". +def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; + +// inb %dx -> inb %al, %dx +def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; +def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; +def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; +def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; + + +// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp +def : InstAlias<"call $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"jmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; + +// Force mov without a suffix with a segment and mem to prefer the 'l' form of +// the move. All segment/mem forms are equivalent, this has the shortest +// encoding. +def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; + +// Match 'movq <largeimm>, <reg>' as an alias for movabsq. 
+def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; + +// Match 'movq GR64, MMX' as an alias for movd. +def : InstAlias<"movq {$src, $dst|$dst, $src}", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq {$src, $dst|$dst, $src}", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; + +// movsx aliases +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; + +// movzx aliases +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +// Note: No GR32->GR64 movzx form. + +// outb %dx -> outb %al, %dx +def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; +def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; +def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; +def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>; + +// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same +// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity +// errors, since its encoding is the most compact. +def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>; + +// shld/shrd op,op -> shld op, op, CL +def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>; +def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>; + +def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>; +def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>; + +/* FIXME: This is disabled because the asm matcher is currently incapable of + * matching a fixed immediate like $1. +// "shl X, $1" is an alias for "shl X". 
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> { + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>; + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>; +} + +defm : ShiftRotateByOneAlias<"rcl", "RCL">; +defm : ShiftRotateByOneAlias<"rcr", "RCR">; +defm : ShiftRotateByOneAlias<"rol", "ROL">; +defm : ShiftRotateByOneAlias<"ror", "ROR">; +FIXME */ + +// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. +def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", + (TEST8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", + (TEST16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", + (TEST32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", + (TEST64rm GR64:$val, i64mem:$mem), 0>; + +// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. +def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", + (XCHG8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", + (XCHG16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", + (XCHG32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", + (XCHG64rm GR64:$val, i64mem:$mem), 0>; + +// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. +def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", + (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", + (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; + +// These aliases exist to get the parser to prioritize matching 8-bit +// immediate encodings over matching the implicit ax/eax/rax encodings. By +// explicitly mentioning the A register here, these entries will be ordered +// first due to the more explicit immediate type. 
+def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>; + +def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>; + +def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td new file mode 100644 index 0000000..83f9b14 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -0,0 +1,674 @@ +//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 MMX instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +// All instructions that use MMX should be in this file, even if they also use +// SSE. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX Multiclasses +//===----------------------------------------------------------------------===// + +let Sched = WriteVecALU in { +def MMX_INTALU_ITINS : OpndItins< + IIC_MMX_ALU_RR, IIC_MMX_ALU_RM +>; + +def MMX_INTALUQ_ITINS : OpndItins< + IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM +>; + +def MMX_PHADDSUBW : OpndItins< + IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM +>; + +def MMX_PHADDSUBD : OpndItins< + IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM +>; +} + +let Sched = WriteVecLogic in +def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins< + IIC_MMX_ALU_RR, IIC_MMX_ALU_RM +>; + +let Sched = WriteVecIMul in +def MMX_PMUL_ITINS : OpndItins< + IIC_MMX_PMUL, IIC_MMX_PMUL +>; + +let Sched = WriteVecIMul in { +def MMX_PSADBW_ITINS : OpndItins< + IIC_MMX_PSADBW, IIC_MMX_PSADBW +>; + +def MMX_MISC_FUNC_ITINS : OpndItins< + IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG +>; +} + +def MMX_SHIFT_ITINS : ShiftOpndItins< + IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI +>; + +let Sched = WriteShuffle in { +def MMX_UNPCK_H_ITINS : OpndItins< + IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM +>; + +def MMX_UNPCK_L_ITINS : OpndItins< + IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L +>; + +def MMX_PCK_ITINS : OpndItins< + IIC_MMX_PCK_RR, IIC_MMX_PCK_RM +>; + +def MMX_PSHUF_ITINS : OpndItins< + IIC_MMX_PSHUF, IIC_MMX_PSHUF +>; +} // Sched + +let Sched = WriteCvtF2I in { +def MMX_CVT_PD_ITINS : OpndItins< + IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM +>; + +def MMX_CVT_PS_ITINS : OpndItins< + IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM +>; +} + +let Constraints = "$src1 = $dst" in { + // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. + // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. + multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + OpndItins itins, bit Commutable = 0> { + def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[itins.Sched]> { + let isCommutable = Commutable; + } + def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } + + multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, Intrinsic IntId, + Intrinsic IntId2, ShiftOpndItins itins> { + def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[WriteVecShift]>; + def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))], + itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; + def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), + (ins VR64:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>, + Sched<[WriteVecShift]>; + } +} + +/// Unary MMX instructions requiring SSSE3. 
+multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64, OpndItins itins> { + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>, + Sched<[itins.Sched]>; + + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, + (IntId64 (bitconvert (memopmmx addr:$src))))], + itins.rm>, Sched<[itins.Sched.Folded]>; +} + +/// Binary MMX instructions requiring SSSE3. +let ImmT = NoImm, Constraints = "$src1 = $dst" in { +multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64, OpndItins itins> { + let isCommutable = 0 in + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, + (IntId64 VR64:$src1, + (bitconvert (memopmmx addr:$src2))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} + +/// PALIGN MMX instructions (require SSSE3). +multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { + def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, + Sched<[WriteShuffle]>; + def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, + string asm, OpndItins itins, Domain d> { + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>, + Sched<[itins.Sched]>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>, + Sched<[itins.Sched.Folded]>; +} + +multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, Domain d> { + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), + (ins DstRC:$src1, SrcRC:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + NoItinerary, d>, Sched<[WriteCvtI2F]>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + NoItinerary, d>, Sched<[WriteCvtI2FLd]>; +} + +//===----------------------------------------------------------------------===// +// MMX EMMS Instruction +//===----------------------------------------------------------------------===// + +def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", + [(int_x86_mmx_emms)], IIC_MMX_EMMS>; + +//===----------------------------------------------------------------------===// +// MMX Scalar Instructions +//===----------------------------------------------------------------------===// + +// 
Data Transfer Instructions +def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector GR32:$src)))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; +def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector (loadi32 addr:$src))))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>; + +let Predicates = [HasMMX] in { + let AddedComplexity = 15 in + def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)), + (MMX_MOVD64rr GR32:$src)>; + let AddedComplexity = 20 in + def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))), + (MMX_MOVD64rm addr:$src)>; +} + +let mayStore = 1 in +def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), + "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>, + Sched<[WriteStore]>; + +def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, + (MMX_X86movd2w (x86mmx VR64:$src)))], + IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; + +let isBitcast = 1 in +def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, (bitconvert GR64:$src))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst), + (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>; + +// These are 64 bit moves, but since the OS X assembler doesn't +// recognize a register-register movq, we write them as +// movd. +let SchedRW = [WriteMove], isBitcast = 1 in { +def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, + (outs GR64:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, + (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>; +let hasSideEffects = 0 in +def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; +} +} // SchedRW + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem, + (outs i64mem:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>; + +let SchedRW = [WriteLoad] in { +let canFoldAsLoad = 1 in +def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, (load_mmx addr:$src))], + IIC_MMX_MOVQ_RM>; +} // SchedRW +let SchedRW = [WriteStore] in +def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (x86mmx VR64:$src), addr:$dst)], + IIC_MMX_MOVQ_RM>; + +let SchedRW = [WriteMove] in { +def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (bitconvert + (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))))))], + IIC_MMX_MOVQ_RR>; + +def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 + (scalar_to_vector + (i64 (bitconvert 
(x86mmx VR64:$src))))))], + IIC_MMX_MOVQ_RR>; + +let isCodeGenOnly = 1, hasSideEffects = 1 in { +def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; + +def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; +} +} // SchedRW + +let Predicates = [HasSSE1] in +def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movntq\t{$src, $dst|$dst, $src}", + [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], + IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>; + +let Predicates = [HasMMX] in { + let AddedComplexity = 15 in + // movd to MMX register zero-extends + def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))), + (MMX_MOVD64rr GR32:$src)>; + let AddedComplexity = 20 in + def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))), + (MMX_MOVD64rm addr:$src)>; +} + +// Arithmetic Instructions +defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, + MMX_INTALU_ITINS>; +defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w, + MMX_INTALU_ITINS>; +defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d, + MMX_INTALU_ITINS>; +// -- Addition +defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, + MMX_INTALU_ITINS, 1>; +let Predicates = [HasSSE2] in +defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, + MMX_INTALUQ_ITINS, 1>; +defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, + MMX_INTALU_ITINS, 1>; + +defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, + MMX_INTALU_ITINS, 1>; + +defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w, + MMX_PHADDSUBW>; +defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d, + MMX_PHADDSUBD>; +defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw, + MMX_PHADDSUBW>; + + +// -- Subtraction +defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b, + MMX_INTALU_ITINS>; +defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, + MMX_INTALU_ITINS>; +defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, + MMX_INTALU_ITINS>; +let Predicates = [HasSSE2] in +defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, + MMX_INTALUQ_ITINS>; + +defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b, + MMX_INTALU_ITINS>; +defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w, + MMX_INTALU_ITINS>; + +defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b, + MMX_INTALU_ITINS>; +defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w, + MMX_INTALU_ITINS>; + +defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w, + MMX_PHADDSUBW>; +defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d, + MMX_PHADDSUBD>; +defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw, + MMX_PHADDSUBW>; + +// 
-- Multiplication +defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, + MMX_PMUL_ITINS, 1>; + +defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, + MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE1] in +defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, + MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE2] in +defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, + MMX_PMUL_ITINS, 1>; +let isCommutable = 1 in +defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", + int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>; + +// -- Miscellanea +defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, + MMX_PMUL_ITINS, 1>; + +defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", + int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>; +let Predicates = [HasSSE1] in { +defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, + MMX_PSADBW_ITINS, 1>; +} + +defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d, + MMX_MISC_FUNC_ITINS>; +let Constraints = "$src1 = $dst" in + defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>; + +// Logical Instructions +defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; +defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; +defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, + MMX_INTALU_ITINS_VECLOGICSCHED>; + +// Shift Instructions +defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", + int_x86_mmx_psrl_w, int_x86_mmx_psrli_w, + MMX_SHIFT_ITINS>; +defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", + int_x86_mmx_psrl_d, int_x86_mmx_psrli_d, + MMX_SHIFT_ITINS>; +defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", + int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, + MMX_SHIFT_ITINS>; + +def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLDrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLQrm VR64:$src1, addr:$src2)>; + +defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", + int_x86_mmx_psll_w, int_x86_mmx_pslli_w, + MMX_SHIFT_ITINS>; +defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", + int_x86_mmx_psll_d, int_x86_mmx_pslli_d, + MMX_SHIFT_ITINS>; +defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", + int_x86_mmx_psll_q, int_x86_mmx_pslli_q, + 
MMX_SHIFT_ITINS>; + +def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLDrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLQrm VR64:$src1, addr:$src2)>; + +defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", + int_x86_mmx_psra_w, int_x86_mmx_psrai_w, + MMX_SHIFT_ITINS>; +defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", + int_x86_mmx_psra_d, int_x86_mmx_psrai_d, + MMX_SHIFT_ITINS>; + +def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRAWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRADrm VR64:$src1, addr:$src2)>; + +// Comparison Instructions +defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d, + MMX_INTALU_ITINS>; + +defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d, + MMX_INTALU_ITINS>; + +// -- Unpack Instructions +defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", + int_x86_mmx_punpckhbw, + MMX_UNPCK_H_ITINS>; +defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", + int_x86_mmx_punpckhwd, + MMX_UNPCK_H_ITINS>; +defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", + int_x86_mmx_punpckhdq, + MMX_UNPCK_H_ITINS>; +defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", + int_x86_mmx_punpcklbw, + MMX_UNPCK_L_ITINS>; +defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", + int_x86_mmx_punpcklwd, + MMX_UNPCK_L_ITINS>; +defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", + int_x86_mmx_punpckldq, + MMX_UNPCK_L_ITINS>; + +// -- Pack Instructions +defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb, + MMX_PCK_ITINS>; +defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw, + MMX_PCK_ITINS>; +defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb, + MMX_PCK_ITINS>; + +// -- Shuffle Instructions +defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, + MMX_PSHUF_ITINS>; + +def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, + (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), + "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))], + IIC_MMX_PSHUF>, Sched<[WriteShuffle]>; +def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, + (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2), + "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w (load_mmx addr:$src1), + imm:$src2))], + IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>; + + + + +// -- Conversion Instructions +defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, + f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PS_ITINS, SSEPackedSingle>, PS; +defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, + f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PD_ITINS, SSEPackedDouble>, PD; +defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, 
int_x86_sse_cvttps2pi, + f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PS_ITINS, SSEPackedSingle>, PS; +defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, + f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PD_ITINS, SSEPackedDouble>, PD; +defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, + i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", + MMX_CVT_PD_ITINS, SSEPackedDouble>, PD; +let Constraints = "$src1 = $dst" in { + defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, + int_x86_sse_cvtpi2ps, + i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, PS; +} + +// Extract / Insert +let Predicates = [HasSSE1] in +def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, + (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, + imm:$src2))], + IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; +let Constraints = "$src1 = $dst" in { +let Predicates = [HasSSE1] in { + def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, + (outs VR64:$dst), + (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, + GR32orGR64:$src2, imm:$src3))], + IIC_MMX_PINSRW>, Sched<[WriteShuffle]>; + + def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, + (outs VR64:$dst), + (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, + (i32 (anyext (loadi16 addr:$src2))), + imm:$src3))], + IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; +} +} + +// Mask creation +let Predicates = [HasSSE1] in +def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR64:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, + (int_x86_mmx_pmovmskb VR64:$src))]>; + + +// Low word of XMM to MMX. +def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; + +def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)), + (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>; + +def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), + (x86mmx (MMX_MOVQ64rm addr:$src))>; + +// Misc. +let SchedRW = [WriteShuffle] in { +let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in +def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], + IIC_MMX_MASKMOV>; +let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in +def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], + IIC_MMX_MASKMOV>; +} + +// 64-bit bit convert. +let Predicates = [HasSSE2] in { +def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), + (MMX_MOVQ2FR64rr VR64:$src)>; +def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), + (MMX_MOVFR642Qrr FR64:$src)>; +} + + diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td new file mode 100644 index 0000000..31608cd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td @@ -0,0 +1,70 @@ +//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MPX instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
+  def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
+              OpcodeStr#" \t{$src, $dst|$dst, $src}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+              OpcodeStr#" \t{$src, $dst|$dst, $src}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+}
+
+defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
+
+multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
+  def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+  def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+}
+defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS;
+defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD;
+defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
+
+def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
+                   "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                   Requires<[HasMPX]>;
+def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+                     "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                     Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
+                      "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                      Requires<[HasMPX, In64BitMode]>;
+
+def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
+                   "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                   Requires<[HasMPX]>;
+def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs i64mem:$dst), (ins BNDR:$src),
+                     "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                     Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src),
+                      "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                      Requires<[HasMPX, In64BitMode]>;
+
+def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+                "bndstx \t{$src, $dst|$dst, $src}", []>, PS,
+                Requires<[HasMPX]>;
+def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+                "bndldx \t{$src, $dst|$dst, $src}", []>, PS,
+                Requires<[HasMPX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
new file mode 100644
index 0000000..84119ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
@@ -0,0 +1,24 @@
+//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel SGX instruction
+// set.
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SGX instructions + +// ENCLS - Execute an Enclave System Function of Specified Leaf Number +def ENCLS : I<0x01, MRM_CF, (outs), (ins), + "encls", []>, TB; + +// ENCLU - Execute an Enclave User Function of Specified Leaf Number +def ENCLU : I<0x01, MRM_D7, (outs), (ins), + "enclu", []>, TB; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td new file mode 100644 index 0000000..624b931 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -0,0 +1,8944 @@ +//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 SSE instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> { + InstrItinClass rr = arg_rr; + InstrItinClass rm = arg_rm; + // InstrSchedModel info. + X86FoldableSchedWrite Sched = WriteFAdd; +} + +class SizeItins<OpndItins arg_s, OpndItins arg_d> { + OpndItins s = arg_s; + OpndItins d = arg_d; +} + + +class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, + InstrItinClass arg_ri> { + InstrItinClass rr = arg_rr; + InstrItinClass rm = arg_rm; + InstrItinClass ri = arg_ri; +} + + +// scalar +let Sched = WriteFAdd in { +def SSE_ALU_F32S : OpndItins< + IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM +>; + +def SSE_ALU_F64S : OpndItins< + IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM +>; +} + +def SSE_ALU_ITINS_S : SizeItins< + SSE_ALU_F32S, SSE_ALU_F64S +>; + +let Sched = WriteFMul in { +def SSE_MUL_F32S : OpndItins< + IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM +>; + +def SSE_MUL_F64S : OpndItins< + IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM +>; +} + +def SSE_MUL_ITINS_S : SizeItins< + SSE_MUL_F32S, SSE_MUL_F64S +>; + +let Sched = WriteFDiv in { +def SSE_DIV_F32S : OpndItins< + IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM +>; + +def SSE_DIV_F64S : OpndItins< + IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM +>; +} + +def SSE_DIV_ITINS_S : SizeItins< + SSE_DIV_F32S, SSE_DIV_F64S +>; + +// parallel +let Sched = WriteFAdd in { +def SSE_ALU_F32P : OpndItins< + IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM +>; + +def SSE_ALU_F64P : OpndItins< + IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM +>; +} + +def SSE_ALU_ITINS_P : SizeItins< + SSE_ALU_F32P, SSE_ALU_F64P +>; + +let Sched = WriteFMul in { +def SSE_MUL_F32P : OpndItins< + IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM +>; + +def SSE_MUL_F64P : OpndItins< + IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM +>; +} + +def SSE_MUL_ITINS_P : SizeItins< + SSE_MUL_F32P, SSE_MUL_F64P +>; + +let Sched = WriteFDiv in { +def SSE_DIV_F32P : OpndItins< + IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM +>; + +def SSE_DIV_F64P : OpndItins< + IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM +>; +} + +def SSE_DIV_ITINS_P : SizeItins< + SSE_DIV_F32P, SSE_DIV_F64P +>; + +let Sched = WriteVecLogic in +def SSE_VEC_BIT_ITINS_P : OpndItins< + IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM +>; + +def SSE_BIT_ITINS_P : OpndItins< + 
IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM +>; + +let Sched = WriteVecALU in { +def SSE_INTALU_ITINS_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +def SSE_INTALUQ_ITINS_P : OpndItins< + IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM +>; +} + +let Sched = WriteVecIMul in +def SSE_INTMUL_ITINS_P : OpndItins< + IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM +>; + +def SSE_INTSHIFT_ITINS_P : ShiftOpndItins< + IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI +>; + +def SSE_MOVA_ITINS : OpndItins< + IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM +>; + +def SSE_MOVU_ITINS : OpndItins< + IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM +>; + +def SSE_DPPD_ITINS : OpndItins< + IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM +>; + +def SSE_DPPS_ITINS : OpndItins< + IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM +>; + +def DEFAULT_ITINS : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +def SSE_EXTRACT_ITINS : OpndItins< + IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM +>; + +def SSE_INSERT_ITINS : OpndItins< + IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM +>; + +let Sched = WriteMPSAD in +def SSE_MPSADBW_ITINS : OpndItins< + IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM +>; + +let Sched = WriteVecIMul in +def SSE_PMULLD_ITINS : OpndItins< + IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM +>; + +// Definitions for backward compatibility. +// The instructions mapped on these definitions uses a different itinerary +// than the actual scheduling model. +let Sched = WriteShuffle in +def DEFAULT_ITINS_SHUFFLESCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteVecIMul in +def DEFAULT_ITINS_VECIMULSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteShuffle in +def SSE_INTALU_ITINS_SHUFF_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +let Sched = WriteMPSAD in +def DEFAULT_ITINS_MPSADSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteFBlend in +def DEFAULT_ITINS_FBLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteBlend in +def DEFAULT_ITINS_BLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteVarBlend in +def DEFAULT_ITINS_VARBLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteFBlend in +def SSE_INTALU_ITINS_FBLEND_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +let Sched = WriteBlend in +def SSE_INTALU_ITINS_BLEND_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 Instructions Classes +//===----------------------------------------------------------------------===// + +/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class +multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, X86MemOperand x86memop, + Domain d, OpndItins itins, bit Is2Addr = 1> { + let isCommutable = 1 in { + def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>, + Sched<[itins.Sched]>; + } + def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// sse12_fp_scalar_int - SSE 1 & 2 scalar 
instructions intrinsics class +multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + string asm, string SSEVer, string FPSizeStr, + Operand memopr, ComplexPattern mem_cpat, + Domain d, OpndItins itins, bit Is2Addr = 1> { +let isCodeGenOnly = 1 in { + def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, RC:$src2))], itins.rr, d>, + Sched<[itins.Sched]>; + def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse", + SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, mem_cpat:$src2))], itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} + +/// sse12_fp_packed - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType vt, + X86MemOperand x86memop, PatFrag mem_frag, + Domain d, OpndItins itins, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>, + Sched<[itins.Sched]>; + let mayLoad = 1 in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], + itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, + string OpcodeStr, X86MemOperand x86memop, + list<dag> pat_rr, list<dag> pat_rm, + bit Is2Addr = 1> { + let isCommutable = 1, hasSideEffects = 0 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + pat_rr, NoItinerary, d>, + Sched<[WriteVecLogic]>; + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + pat_rm, NoItinerary, d>, + Sched<[WriteVecLogicLd, ReadAfterLd]>; +} + +//===----------------------------------------------------------------------===// +// Non-instruction patterns +//===----------------------------------------------------------------------===// + +// A vector extract of the first f32/f64 position is a subregister copy +def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; +def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; + +// A 128-bit subvector extract from the first 256-bit vector position +// is a subregister copy that needs no instruction. 
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>; +def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>; + +def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>; +def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>; + +def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))), + (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>; +def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))), + (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>; + +// A 128-bit subvector insert to the first 256-bit vector position +// is a subregister copy that needs no instruction. +let AddedComplexity = 25 in { // to give priority over vinsertf128rm +def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +} + +// Implicitly promote a 32-bit scalar to a vector. +def : Pat<(v4f32 (scalar_to_vector FR32:$src)), + (COPY_TO_REGCLASS FR32:$src, VR128)>; +def : Pat<(v8f32 (scalar_to_vector FR32:$src)), + (COPY_TO_REGCLASS FR32:$src, VR128)>; +// Implicitly promote a 64-bit scalar to a vector. +def : Pat<(v2f64 (scalar_to_vector FR64:$src)), + (COPY_TO_REGCLASS FR64:$src, VR128)>; +def : Pat<(v4f64 (scalar_to_vector FR64:$src)), + (COPY_TO_REGCLASS FR64:$src, VR128)>; + +// Bitcasts between 128-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasSSE2] in { + def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>; + def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>; +} + +// Bitcasts between 256-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasAVX] in { + def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>; +} + +// Alias instructions that map fld0 to xorps for sse or vxorps for avx. +// This is expanded by ExpandPostRAPseudos. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { + def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>; + def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>; +} + +//===----------------------------------------------------------------------===// +// AVX & SSE - Zero/One Vectors +//===----------------------------------------------------------------------===// + +// Alias instruction that maps zero vector to pxor / xorp* for sse. +// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then +// swizzled by ExecutionDepsFix to pxor. +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-zeros value if folding it would be beneficial. 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { +def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; +} + +def : Pat<(v2f64 immAllZerosV), (V_SET0)>; +def : Pat<(v4i32 immAllZerosV), (V_SET0)>; +def : Pat<(v2i64 immAllZerosV), (V_SET0)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0)>; +def : Pat<(v16i8 immAllZerosV), (V_SET0)>; + + +// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI, +// and doesn't need it because on sandy bridge the register is set to zero +// at the rename stage without using any execution unit, so SET0PSY +// and SET0PDY can be used for vector int instructions without penalty +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in { +def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8f32 immAllZerosV))]>; +} + +let Predicates = [HasAVX] in + def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; + +let Predicates = [HasAVX2] in { + def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; + def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>; + def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; + def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; +} + +// AVX1 has no support for 256-bit integer instructions, but since the 128-bit +// VPXOR instruction writes zero to its upper part, it's safe build zeros. +let Predicates = [HasAVX1Only] in { +def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v32i8 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; + +def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v16i16 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>; + +def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v8i32 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>; + +def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v4i64 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; +} + +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-ones value if folding it would be beneficial. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { + def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllOnesV))]>; + let Predicates = [HasAVX2] in + def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; +} + + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move FP Scalar Instructions +// +// Move Instructions. Register-to-register movss/movsd is not used for FR32/64 +// register copies because it's a partial register update; Register-to-register +// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires +// that the insert be implementable in terms of a copy, and just mentioned, we +// don't use movss/movsd for copies. 
+//===----------------------------------------------------------------------===// + +multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string base_opc, + string asm_opr, Domain d = GenericDomain> { + def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, RC:$src2), + !strconcat(base_opc, asm_opr), + [(set VR128:$dst, (vt (OpNode VR128:$src1, + (scalar_to_vector RC:$src2))))], + IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>; + + // For the disassembler + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, RC:$src2), + !strconcat(base_opc, asm_opr), + [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>; +} + +multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string OpcodeStr, + Domain d = GenericDomain> { + // AVX + defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>, + VEX_4V, VEX_LIG; + + def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>, + VEX, VEX_LIG, Sched<[WriteStore]>; + // SSE1 & 2 + let Constraints = "$src1 = $dst" in { + defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $dst|$dst, $src2}", d>; + } + + def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>, + Sched<[WriteStore]>; +} + +// Loading from memory automatically zeroing upper bits. +multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_pat, string OpcodeStr, + Domain d = GenericDomain> { + def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], + IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>; + def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], + IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>; +} + +defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", + SSEPackedSingle>, XS; +defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", + SSEPackedDouble>, XD; + +let canFoldAsLoad = 1, isReMaterializable = 1 in { + defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss", + SSEPackedSingle>, XS; + + let AddedComplexity = 20 in + defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd", + SSEPackedDouble>, XD; +} + +// Patterns +let Predicates = [UseAVX] in { + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; + } + + // Extract and store. + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>; + + // Shuffle with VMOVSS + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4i32 VR128:$src1), + (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4f32 VR128:$src1), + (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>; + + // 256-bit variants + def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)), + sub_xmm)>; + + // Shuffle with VMOVSD + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), + sub_xmm)>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. 
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; +} + +let Predicates = [UseSSE1] in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSS to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + } + + let AddedComplexity = 20 in { + // MOVSSrm already zeros the high parts of the register. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + } + + // Extract and store. + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; + + // Shuffle with MOVSS + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; +} + +let Predicates = [UseSSE2] in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSD to the lower bits. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + } + + let AddedComplexity = 20 in { + // MOVSDrm already zeros the high parts of the register. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + } + + // Extract and store. 
+ def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>; + + // Shuffle with MOVSD + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold because + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, + X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d, + OpndItins itins, + bit IsReMaterializable = 1> { +let hasSideEffects = 0 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>, + Sched<[WriteFShuffle]>; +let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>, + Sched<[WriteLoad]>; +} + +let Predicates = [HasAVX, NoVLX] in { +defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + PS, VEX; +defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD, VEX; +defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + PS, VEX; +defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + PD, VEX; + +defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + PS, VEX, VEX_L; +defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD, VEX, VEX_L; +defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + PS, VEX, VEX_L; +defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + PD, VEX, VEX_L; +} + +let Predicates = [UseSSE1] in { +defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + PS; +defm MOVUPS : sse12_mov_packed<0x10, 
VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + PS; +} +let Predicates = [UseSSE2] in { +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD; +defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + PD; +} + +let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { +def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; +def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; +def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; +def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; +def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore256 (v8f32 VR256:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX, VEX_L; +def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore256 (v4f64 VR256:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX, VEX_L; +def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v8f32 VR256:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX, VEX_L; +def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v4f64 VR256:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX, VEX_L; +} // SchedRW + +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteFShuffle] in { + def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; + def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; + def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; + def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; + def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; + def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; + def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; + def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; +} + +def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), + (VMOVUPSYmr addr:$dst, VR256:$src)>; +def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), + (VMOVUPDYmr 
addr:$dst, VR256:$src)>; + +let SchedRW = [WriteStore] in { +def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>; +def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>; +def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>; +def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>; +} // SchedRW + +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteFShuffle] in { + def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; + def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; + def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>; + def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>; +} + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (VMOVUPDmr addr:$dst, VR128:$src)>; +} + +let Predicates = [UseSSE1] in + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (MOVUPSmr addr:$dst, VR128:$src)>; +let Predicates = [UseSSE2] in + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (MOVUPDmr addr:$dst, VR128:$src)>; + +// Use vmovaps/vmovups for AVX integer load/store. 
+let Predicates = [HasAVX, NoVLX] in { + // 128-bit load/store + def : Pat<(alignedloadv2i64 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (VMOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + + // 256-bit load/store + def : Pat<(alignedloadv4i64 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(loadv4i64 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v4i64 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v8i32 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16i16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v32i8 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + + // Special patterns for storing subvector extracts of lower 128-bits + // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + def : Pat<(alignedstore (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + + def : Pat<(store (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4i32 (extract_subvector + (v8i32 
VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; +} + +// Use movaps / movups for SSE integer load / store (one byte shorter). +// The instructions selected below are then converted to MOVDQA/MOVDQU +// during the SSE domain pass. +let Predicates = [UseSSE1] in { + def : Pat<(alignedloadv2i64 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (MOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; +} + +// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper +// bits are disregarded. FIXME: Set encoding to pseudo! +let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { +let isCodeGenOnly = 1 in { + def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))], + IIC_SSE_MOVA_P_RM>, VEX; + def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))], + IIC_SSE_MOVA_P_RM>, VEX; + def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))], + IIC_SSE_MOVA_P_RM>; + def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))], + IIC_SSE_MOVA_P_RM>; +} +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low packed FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode, + string base_opc, string asm_opr, + InstrItinClass itin> { + def PSrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "s", asm_opr), + [(set VR128:$dst, + (psnode VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))], + itin, SSEPackedSingle>, PS, + Sched<[WriteFShuffleLd, ReadAfterLd]>; + + def PDrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "d", asm_opr), + [(set VR128:$dst, (v2f64 (pdnode VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))))], + itin, SSEPackedDouble>, PD, + Sched<[WriteFShuffleLd, ReadAfterLd]>; + +} + +multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode 
pdnode, + string base_opc, InstrItinClass itin> { + let Predicates = [UseAVX] in + defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + itin>, VEX_4V; + + let Constraints = "$src1 = $dst" in + defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + "\t{$src2, $dst|$dst, $src2}", + itin>; +} + +let AddedComplexity = 20 in { + defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp", + IIC_SSE_MOV_LH>; +} + +let SchedRW = [WriteStore] in { +let Predicates = [UseAVX] in { +def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, VEX; +def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, VEX; +}// UseAVX +def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>; +def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>; +} // SchedRW + +let Predicates = [UseAVX] in { + // Shuffle with VMOVLPS + def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + + // Shuffle with VMOVLPD + def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; +} + +let Predicates = [UseSSE1] in { + // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS + def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), + (iPTR 0))), addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + + // Shuffle with MOVLPS + def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), + addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; 
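(Aside, not part of the patch.) The MOVLP load patterns above merge a 64-bit memory operand into the low half of an XMM register while preserving the high half, and the MOVLP store patterns write only the low 64 bits back to memory. A hedged intrinsics-level illustration of that behavior (wrapper names are invented for the example):

    #include <emmintrin.h>

    // movlps: low two floats come from memory, high two floats of v are kept.
    __m128 merge_low_ps(__m128 v, const void *p) {
      return _mm_loadl_pi(v, (const __m64 *)p);
    }

    // movlpd: low double from memory, high double of v preserved.
    __m128d merge_low_pd(__m128d v, const double *p) {
      return _mm_loadl_pd(v, p);
    }

    // movlps (store form): writes only the low 8 bytes of v.
    void store_low_ps(void *p, __m128 v) {
      _mm_storel_pi((__m64 *)p, v);
    }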
+} + +let Predicates = [UseSSE2] in { + // Shuffle with MOVLPD + def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Hi packed FP Instructions +//===----------------------------------------------------------------------===// + +let AddedComplexity = 20 in { + defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp", + IIC_SSE_MOV_LH>; +} + +let SchedRW = [WriteStore] in { +// v2f64 extract element 1 is always custom lowered to unpack high to low +// and extract element 0 so the non-store version isn't too horrible. +let Predicates = [UseAVX] in { +def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +} // UseAVX +def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; +def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; +} // SchedRW + +let Predicates = [UseAVX] in { + // VMOVHPS patterns + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + + // VMOVHPD patterns + + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold + // cause it has two uses through a bitcast. One use disappears at isel time + // and the fold opportunity reappears. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. 
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (extractelt + (v2f64 (X86VPermilpi VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDmr addr:$dst, VR128:$src)>; +} + +let Predicates = [UseSSE1] in { + // MOVHPS patterns + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (MOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), + (MOVHPSrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [UseSSE2] in { + // MOVHPD patterns + + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold + // cause it has two uses through a bitcast. One use disappears at isel time + // and the fold opportunity reappears. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (MOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (extractelt + (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (MOVHPDmr addr:$dst, VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions +//===----------------------------------------------------------------------===// + +let AddedComplexity = 20, Predicates = [UseAVX] in { + def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, + VEX_4V, Sched<[WriteFShuffle]>; + def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, + VEX_4V, Sched<[WriteFShuffle]>; +} +let Constraints = "$src1 = $dst", AddedComplexity = 20 in { + def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; + def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; +} + +let Predicates = [UseAVX] in { + // MOVLHPS patterns + def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; + + // MOVHLPS patterns + def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), + (VMOVHLPSrr VR128:$src1, VR128:$src2)>; +} + +let Predicates = [UseSSE1] in { + // MOVLHPS patterns + def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; + + // MOVHLPS 
patterns + def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), + (MOVHLPSrr VR128:$src1, VR128:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Conversion Instructions +//===----------------------------------------------------------------------===// + +def SSE_CVT_PD : OpndItins< + IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM +>; + +let Sched = WriteCvtI2F in +def SSE_CVT_PS : OpndItins< + IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM +>; + +let Sched = WriteCvtI2F in +def SSE_CVT_Scalar : OpndItins< + IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM +>; + +let Sched = WriteCvtF2I in +def SSE_CVT_SS2SI_32 : OpndItins< + IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM +>; + +let Sched = WriteCvtF2I in +def SSE_CVT_SS2SI_64 : OpndItins< + IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM +>; + +let Sched = WriteCvtF2I in +def SSE_CVT_SD2SI : OpndItins< + IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM +>; + +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) +multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm, OpndItins itins> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (OpNode SrcRC:$src))], + itins.rr>, Sched<[itins.Sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], + itins.rm>, Sched<[itins.Sched.Folded]>; +} + +multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + X86MemOperand x86memop, string asm, Domain d, + OpndItins itins> { +let hasSideEffects = 0 in { + def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [], itins.rr, d>, Sched<[itins.Sched]>; + let mayLoad = 1 in + def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [], itins.rm, d>, Sched<[itins.Sched.Folded]>; +} +} + +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) +multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + X86MemOperand x86memop, string asm> { +let hasSideEffects = 0, Predicates = [UseAVX] in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + Sched<[WriteCvtI2F]>; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + Sched<[WriteCvtI2FLd, ReadAfterLd]>; +} // hasSideEffects = 0 +} + +let Predicates = [UseAVX] in { +defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_32>, + XS, VEX, VEX_LIG; +defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_64>, + XS, VEX, VEX_W, VEX_LIG; +defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, + XD, VEX, VEX_LIG; +defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, + XD, VEX, VEX_W, VEX_LIG; + +def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>; 
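(Aside, not part of the patch.) The CVTT* forms above perform the truncate-toward-zero conversion that a plain C/C++ cast requires, while the non-truncating CVTSS2SI/CVTSD2SI forms defined later in this section round according to the current MXCSR rounding mode. A short illustrative sketch, assuming an x86-64 target (function names are invented):

    #include <emmintrin.h>

    // C++ casts truncate toward zero, so they normally select the cvtt* forms.
    int       f32_to_i32(float f)  { return (int)f; }        // cvttss2si
    long long f64_to_i64(double d) { return (long long)d; }  // cvttsd2si (REX.W/VEX.W form)

    // The intrinsic versions read the low element of an XMM register and
    // round according to MXCSR instead of truncating.
    int rounded_from_ss(__m128 v)  { return _mm_cvtss_si32(v); }  // cvtss2si
    int rounded_from_sd(__m128d v) { return _mm_cvtsd_si32(v); }  // cvtsd2si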
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; +} +// The assembler can recognize rr 64-bit instructions by seeing a rxx +// register, but the same isn't true when only using memory operands, +// provide other assembly "l" and "q" forms to address this explicitly +// where appropriate to do so. +defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">, + XS, VEX_4V, VEX_LIG; +defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, + XS, VEX_4V, VEX_W, VEX_LIG; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, + XD, VEX_4V, VEX_LIG; +defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, + XD, VEX_4V, VEX_W, VEX_LIG; + +let Predicates = [UseAVX] in { + def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; + def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; + + def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>; + + def : Pat<(f32 (sint_to_fp GR32:$src)), + (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f32 (sint_to_fp GR64:$src)), + (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>; + def : Pat<(f64 (sint_to_fp GR32:$src)), + (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f64 (sint_to_fp GR64:$src)), + (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>; +} + +defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_32>, XS; +defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_64>, XS, REX_W; +defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, XD; +defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, XD, REX_W; +defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, + "cvtsi2ss{l}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XS; +defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, + "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XS, REX_W; +defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, + "cvtsi2sd{l}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XD; +defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, 
loadi64, + "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XD, REX_W; + +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>; +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; + +def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", + (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>; +def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", + (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>; + +// Conversion Instructions Intrinsics - Match intrinsics which expect MM +// and/or XMM operand(s). + +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) +multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, Operand memop, ComplexPattern mem_cpat, + string asm, OpndItins itins> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>, + Sched<[itins.Sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, OpndItins itins, + bit Is2Addr = 1> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + itins.rr>, Sched<[itins.Sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [UseAVX] in { +defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, + int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", + SSE_CVT_SD2SI>, XD, VEX, VEX_LIG; +defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", + SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG; +} +defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD; +defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, + sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W; + + +let isCodeGenOnly = 1 in { + let Predicates = [UseAVX] in { + defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", + 
SSE_CVT_Scalar, 0>, XS, VEX_4V; + defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", + SSE_CVT_Scalar, 0>, XS, VEX_4V, + VEX_W; + defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", + SSE_CVT_Scalar, 0>, XD, VEX_4V; + defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", + SSE_CVT_Scalar, 0>, XD, + VEX_4V, VEX_W; + } + let Constraints = "$src1 = $dst" in { + defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, + "cvtsi2ss{l}", SSE_CVT_Scalar>, XS; + defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, + "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W; + defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, + "cvtsi2sd{l}", SSE_CVT_Scalar>, XD; + defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, + "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W; + } +} // isCodeGenOnly = 1 + +/// SSE 1 Only + +// Aliases for intrinsics +let isCodeGenOnly = 1 in { +let Predicates = [UseAVX] in { +defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si", + SSE_CVT_SS2SI_32>, XS, VEX; +defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si", SSE_CVT_SS2SI_64>, + XS, VEX, VEX_W; +defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si", + SSE_CVT_SD2SI>, XD, VEX; +defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si", SSE_CVT_SD2SI>, + XD, VEX, VEX_W; +} +defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si", + SSE_CVT_SS2SI_32>, XS; +defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W; +defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si", + SSE_CVT_SD2SI>, XD; +defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W; +} // isCodeGenOnly = 1 + +let Predicates = [UseAVX] in { +defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; +defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; +} +defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_32>, XS; +defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_64>, XS, REX_W; + +defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, + PS, VEX, Requires<[HasAVX]>; +defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem, + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, + PS, VEX, VEX_L, Requires<[HasAVX]>; + +defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, + "cvtdq2ps\t{$src, $dst|$dst, $src}", + 
SSEPackedSingle, SSE_CVT_PS>, + PS, Requires<[UseSSE2]>; + +let Predicates = [UseAVX] in { +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; +} + +def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>; +def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rm GR64:$dst, sdmem:$src)>; + +/// SSE 2 Only + +// Convert scalar double to scalar single +let hasSideEffects = 0, Predicates = [UseAVX] in { +def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), + (ins FR64:$src1, FR64:$src2), + "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2F]>; +let mayLoad = 1 in +def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), + (ins FR64:$src1, f64mem:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], IIC_SSE_CVT_Scalar_RM>, + XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, + Requires<[UseAVX]>; + +def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround FR64:$src))], + IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>; +def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround (loadf64 addr:$src)))], + IIC_SSE_CVT_Scalar_RM>, + XD, + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; + +let isCodeGenOnly = 1 in { +def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2F]>; +def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss + VR128:$src1, sse_load_f64:$src2))], + IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; 
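(Aside, not part of the patch.) The scalar double-to-single and single-to-double conversions in this block are what ordinary float/double casts compile to; the register-register intrinsic forms also carry the upper XMM elements through from their first operand, which is why the VEX-encoded variants above take an extra source register. A hedged sketch with invented wrapper names:

    #include <emmintrin.h>

    float  narrow_f64(double d) { return (float)d; }   // cvtsd2ss / vcvtsd2ss
    double widen_f32(float f)   { return (double)f; }  // cvtss2sd / vcvtss2sd

    // Low element is converted; the upper elements are copied from 'hi'.
    __m128  sd_to_ss(__m128 hi, __m128d lo) { return _mm_cvtsd_ss(hi, lo); }
    __m128d ss_to_sd(__m128d hi, __m128 lo) { return _mm_cvtss_sd(hi, lo); }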
+ +let Constraints = "$src1 = $dst" in { +def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtsd2ss\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, + Sched<[WriteCvtF2F]>; +def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), + "cvtsd2ss\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss + VR128:$src1, sse_load_f64:$src2))], + IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} +} // isCodeGenOnly = 1 + +// Convert scalar single to scalar double +// SSE2 instructions with XS prefix +let hasSideEffects = 0, Predicates = [UseAVX] in { +def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), + (ins FR32:$src1, FR32:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], IIC_SSE_CVT_Scalar_RR>, + XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2F]>; +let mayLoad = 1 in +def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), + (ins FR32:$src1, f32mem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], IIC_SSE_CVT_Scalar_RM>, + XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +def : Pat<(f64 (fextend FR32:$src)), + (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>; +def : Pat<(fextend (loadf32 addr:$src)), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; + +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[UseAVX, OptForSize]>; +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, + Requires<[UseAVX, OptForSpeed]>; + +def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (fextend FR32:$src))], + IIC_SSE_CVT_Scalar_RR>, XS, + Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; +def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (extloadf32 addr:$src))], + IIC_SSE_CVT_Scalar_RM>, XS, + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; + +// extload f32 -> f64. This matches load+fextend because we have a hack in +// the isel (PreprocessForFPConvert) that can introduce loads after dag +// combine. +// Since these loads aren't folded into the fextend, we have to match it +// explicitly here. 
+def : Pat<(fextend (loadf32 addr:$src)), + (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>; +def : Pat<(extloadf32 addr:$src), + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; + +let isCodeGenOnly = 1 in { +def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2F]>; +def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], + IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix +def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtF2F]>; +def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], + IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} +} // isCodeGenOnly = 1 + +// Convert packed single/double fp to doubleword +def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; +def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; +def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2dq_256 VR256:$src))], + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; +def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; +def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; +def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; + + +// Convert Packed Double FP to Packed DW Integers +let Predicates = [HasAVX] in { +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. 
+def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + VEX, Sched<[WriteCvtF2I]>; + +// XMM only +def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; +def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dqx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX, + Sched<[WriteCvtF2ILd]>; + +// YMM only +def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L, + Sched<[WriteCvtF2I]>; +def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>, + VEX, VEX_L, Sched<[WriteCvtF2ILd]>; +def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", + (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; +} + +def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>; +def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; + +// Convert with truncation packed single/double fp to doubleword +// SSE2 packed instructions with XS prefix +def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; +def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; +def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; +def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 + (loadv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, + Sched<[WriteCvtF2ILd]>; + +def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; +def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (VCVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), + (VCVTDQ2PSrm addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + (VCVTDQ2PSrr VR128:$src)>; + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), + (VCVTDQ2PSrm addr:$src)>; 
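(Aside, not part of the patch.) The patterns in this block map the generic sint_to_fp and fp_to_sint nodes on packed i32/f32 vectors onto CVTDQ2PS and the truncating CVTTPS2DQ, with 256-bit AVX forms alongside. An illustrative intrinsics-level sketch (wrapper names are invented):

    #include <immintrin.h>

    __m128  i32x4_to_f32x4(__m128i v) { return _mm_cvtepi32_ps(v); }   // cvtdq2ps
    __m128i f32x4_to_i32x4(__m128 v)  { return _mm_cvttps_epi32(v); }  // cvttps2dq (truncates)

    // 256-bit AVX counterparts (vcvtdq2ps / vcvttps2dq on YMM registers).
    __m256  i32x8_to_f32x8(__m256i v) { return _mm256_cvtepi32_ps(v); }
    __m256i f32x8_to_i32x8(__m256 v)  { return _mm256_cvttps_epi32(v); }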
+ + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (VCVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), + (VCVTTPS2DQrm addr:$src)>; + + def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))), + (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))), + (VCVTDQ2PSYrm addr:$src)>; + + def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), + (VCVTTPS2DQYrr VR256:$src)>; + def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), + (VCVTTPS2DQYrm addr:$src)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + (CVTDQ2PSrr VR128:$src)>; + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + (CVTDQ2PSrm addr:$src)>; + + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (CVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), + (CVTDQ2PSrm addr:$src)>; + + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (CVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), + (CVTTPS2DQrm addr:$src)>; +} + +def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; + +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. + +// XMM only +def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; +def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttpd2dqx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (loadv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; + +// YMM only +def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; +def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; +def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), + (VCVTTPD2DQYrr VR256:$src)>; + def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), + (VCVTTPD2DQYrm addr:$src)>; +} // Predicates = [HasAVX] + +def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; +def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (memopv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, + Sched<[WriteCvtF2ILd]>; + +// Convert packed single to packed double +let Predicates = [HasAVX] in { + // SSE2 instructions without OpSize prefix +def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, PS, VEX, 
Sched<[WriteCvtF2F]>; +def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>; +def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2_pd_256 VR128:$src))], + IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>; +def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; +} + +let Predicates = [UseSSE2] in { +def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>; +def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>; +} + +// Convert Packed DW Integers to Packed Double FP +let Predicates = [HasAVX] in { +let hasSideEffects = 0, mayLoad = 1 in +def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + []>, VEX, Sched<[WriteCvtI2FLd]>; +def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX, + Sched<[WriteCvtI2F]>; +def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvtdq2_pd_256 + (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L, + Sched<[WriteCvtI2FLd]>; +def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L, + Sched<[WriteCvtI2F]>; +} + +let hasSideEffects = 0, mayLoad = 1 in +def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>; +def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>; + +// AVX register conversion intrinsics +let Predicates = [HasAVX] in { + def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), + (VCVTDQ2PDrr VR128:$src)>; + def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), + (VCVTDQ2PDrm addr:$src)>; + + def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), + (VCVTDQ2PDYrr VR128:$src)>; + def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), + (VCVTDQ2PDYrm addr:$src)>; +} // Predicates = [HasAVX] + +// SSE2 register conversion intrinsics +let Predicates = [HasSSE2] in { + def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), + (CVTDQ2PDrr VR128:$src)>; + def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), + (CVTDQ2PDrm addr:$src)>; +} // Predicates = [HasSSE2] + +// Convert packed double to packed single +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. 
+// Provide other assembly rr and rm forms to address this explicitly. +def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>; + +// XMM only +def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", + (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; +def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2psx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>; + +// YMM only +def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2_ps_256 VR256:$src))], + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>; +def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; +def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", + (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; + +def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>; +def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>; + + +// AVX 256-bit register conversion intrinsics +// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below +// whenever possible to avoid declaring two versions of each one. 
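+// The intrinsic patterns below map int_x86_avx_cvtdq2_ps_256 directly onto
+// VCVTDQ2PSY; the generic X86vfpround/X86vfpext (fround/fextend) patterns
+// that follow cover the non-intrinsic conversions.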
+let Predicates = [HasAVX] in { + def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), + (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))), + (VCVTDQ2PSYrm addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + // Match fround and fextend for 128/256-bit conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (VCVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))), + (VCVTPD2PSXrm addr:$src)>; + def : Pat<(v4f32 (fround (v4f64 VR256:$src))), + (VCVTPD2PSYrr VR256:$src)>; + def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), + (VCVTPD2PSYrm addr:$src)>; + + def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), + (VCVTPS2PDrr VR128:$src)>; + def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), + (VCVTPS2PDYrr VR128:$src)>; + def : Pat<(v4f64 (extloadv4f32 addr:$src)), + (VCVTPS2PDYrm addr:$src)>; +} + +let Predicates = [UseSSE2] in { + // Match fround and fextend for 128 conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (CVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), + (CVTPD2PSrm addr:$src)>; + + def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), + (CVTPS2PDrr VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Compare Instructions +//===----------------------------------------------------------------------===// + +// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions +multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, + Operand CC, SDNode OpNode, ValueType VT, + PatFrag ld_frag, string asm, string asm_alt, + OpndItins itins, ImmLeaf immLeaf> { + def rr : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))], + itins.rr>, Sched<[itins.Sched]>; + def rm : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), + (ld_frag addr:$src2), immLeaf:$cc))], + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. 
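+  // These *_alt forms are assembler-only and carry no patterns; they let the
+  // user write an immediate predicate operand instead of a predicate
+  // mnemonic suffix.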
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [], + IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>; + let mayLoad = 1 in + def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [], + IIC_SSE_ALU_F32S_RM>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + +defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, + "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG; +defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, + "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare + XD, VEX_4V, VEX_LIG; + +let Constraints = "$src1 = $dst" in { + defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S, + i8immZExt3>, XS; + defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSE_ALU_F64S, i8immZExt3>, XD; +} + +multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, + Intrinsic Int, string asm, OpndItins itins, + ImmLeaf immLeaf> { + def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src, CC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + VR128:$src, immLeaf:$cc))], + itins.rr>, + Sched<[itins.Sched]>; + def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, x86memop:$src, CC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + (load addr:$src), immLeaf:$cc))], + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let isCodeGenOnly = 1 in { + // Aliases to match intrinsics which expect XMM operand(s). 
+ defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S, i8immZExt5>, + XS, VEX_4V; + defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S, i8immZExt5>, // same latency as f32 + XD, VEX_4V; + let Constraints = "$src1 = $dst" in { + defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $dst|$dst, $src}", + SSE_ALU_F32S, i8immZExt3>, XS; + defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $dst|$dst, $src}", + SSE_ALU_F64S, i8immZExt3>, + XD; +} +} + + +// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS +multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, + ValueType vt, X86MemOperand x86memop, + PatFrag ld_frag, string OpcodeStr> { + def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], + IIC_SSE_COMIS_RR>, + Sched<[WriteFAdd]>; + def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), + (ld_frag addr:$src2)))], + IIC_SSE_COMIS_RM>, + Sched<[WriteFAddLd, ReadAfterLd]>; +} + +let Defs = [EFLAGS] in { + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss">, PS, VEX, VEX_LIG; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd">, PD, VEX, VEX_LIG; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, + "comiss">, PS, VEX, VEX_LIG; + defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, + "comisd">, PD, VEX, VEX_LIG; + } + + let isCodeGenOnly = 1 in { + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss">, PS, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd">, PD, VEX; + + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss">, PS, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, "comisd">, PD, VEX; + } + defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss">, PS; + defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd">, PD; + + let Pattern = []<dag> in { + defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, + "comiss">, PS; + defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, + "comisd">, PD; + } + + let isCodeGenOnly = 1 in { + defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss">, PS; + defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd">, PD; + + defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, + "comiss">, PS; + defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, + "comisd">, PD; + } +} // Defs = [EFLAGS] + +// sse12_cmp_packed - sse 1 & 2 compare packed instructions +multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, + Operand CC, Intrinsic Int, string asm, + string asm_alt, Domain d, ImmLeaf immLeaf, + PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> { + let isCommutable = 1 in + def rri : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, 
CC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))], + itins.rr, d>, + Sched<[WriteFAdd]>; + def rmi : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))], + itins.rm, d>, + Sched<[WriteFAddLd, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rri_alt : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), + asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>; + let mayLoad = 1 in + def rmi_alt : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), + asm_alt, [], itins.rm, d>, + Sched<[WriteFAddLd, ReadAfterLd]>; + } +} + +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L; +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in { + defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", + "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS; + defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", + "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)), + (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), + (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)), + (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), + (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>; +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)), + (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>; +def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), + (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>; +def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)), + (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; +} + +let Predicates = [UseSSE1] in { +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPSrri (v4f32 
VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)), + (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; +} + +let Predicates = [UseSSE2] in { +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)), + (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Shuffle Instructions +//===----------------------------------------------------------------------===// + +/// sse12_shuffle - sse 1 & 2 fp shuffle instructions +multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, + ValueType vt, string asm, PatFrag mem_frag, + Domain d> { + def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, + Sched<[WriteFShuffleLd, ReadAfterLd]>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, + Sched<[WriteFShuffle]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv4f32, SSEPackedSingle>, PS, VEX_4V; + defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L; + defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv2f64, SSEPackedDouble>, PD, VEX_4V; + defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L; +} +let Constraints = "$src1 = $dst" in { + defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", + memopv4f32, SSEPackedSingle>, PS; + defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", + memopv2f64, SSEPackedDouble>, PD; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (X86Shufp VR128:$src1, + (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), + (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; + + def : Pat<(v2i64 (X86Shufp VR128:$src1, + (loadv2i64 addr:$src2), (i8 imm:$imm))), + (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; + + // 256-bit patterns + def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>; + def : Pat<(v8i32 (X86Shufp VR256:$src1, + (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>; + + def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>; + def : Pat<(v4i64 (X86Shufp VR256:$src1, + (loadv4i64 addr:$src2), (i8 imm:$imm))), + (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; +} + +let Predicates = [UseSSE1] in { + def 
: Pat<(v4i32 (X86Shufp VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), + (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; +} + +let Predicates = [UseSSE2] in { + // Generic SHUFPD patterns + def : Pat<(v2i64 (X86Shufp VR128:$src1, + (memopv2i64 addr:$src2), (i8 imm:$imm))), + (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Unpack FP Instructions +//===----------------------------------------------------------------------===// + +/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave +multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, + PatFrag mem_frag, RegisterClass RC, + X86MemOperand x86memop, string asm, + Domain d> { + def rr : PI<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, RC:$src2)))], + IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>; + def rm : PI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, + (mem_frag addr:$src2))))], + IIC_SSE_UNPCK, d>, + Sched<[WriteFShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoVLX] in { +defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, PS, VEX_4V; +defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, PD, VEX_4V; +defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, PS, VEX_4V; +defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, PD, VEX_4V; + +defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, PS, VEX_4V, VEX_L; +defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, PD, VEX_4V, VEX_L; +defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, PS, VEX_4V, VEX_L; +defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, PD, VEX_4V, VEX_L; +}// Predicates = [HasAVX, NoVLX] +let Constraints = "$src1 = $dst" in { + defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, PS; + defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", + SSEPackedDouble>, PD; + defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, PS; + defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, 
memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", + SSEPackedDouble>, PD; +} // Constraints = "$src1 = $dst" + +let Predicates = [HasAVX1Only] in { + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Extract Floating-Point Sign mask +//===----------------------------------------------------------------------===// + +/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave +multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, + Domain d> { + def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>, + Sched<[WriteVecLogic]>; +} + +let Predicates = [HasAVX] in { + defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, + "movmskps", SSEPackedSingle>, PS, VEX; + defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, + "movmskpd", SSEPackedDouble>, PD, VEX; + defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, + "movmskps", SSEPackedSingle>, PS, + VEX, VEX_L; + defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, + "movmskpd", SSEPackedDouble>, PD, + VEX, VEX_L; + + def : Pat<(i32 (X86fgetsign FR32:$src)), + (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(i64 (X86fgetsign FR32:$src)), + (SUBREG_TO_REG (i64 0), + (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>; + def : Pat<(i32 (X86fgetsign FR64:$src)), + (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(i64 (X86fgetsign FR64:$src)), + (SUBREG_TO_REG (i64 0), + (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>; +} + +defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", + SSEPackedSingle>, PS; +defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", + SSEPackedDouble>, PD; + +def : Pat<(i32 (X86fgetsign FR32:$src)), + (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>, + Requires<[UseSSE1]>; +def : Pat<(i64 (X86fgetsign FR32:$src)), + (SUBREG_TO_REG (i64 0), + (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>, + Requires<[UseSSE1]>; +def : Pat<(i32 (X86fgetsign FR64:$src)), + (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>, + Requires<[UseSSE2]>; +def : Pat<(i64 (X86fgetsign FR64:$src)), + (SUBREG_TO_REG (i64 0), + (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>, + Requires<[UseSSE2]>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions 
+//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +/// PDI_binop_rm - Simple SSE2 binary operator. +multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, OpndItins itins, + bit IsCommutable, bit Is2Addr> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, + Sched<[itins.Sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))], + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} // ExeDomain = SSEPackedInt + +multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, + ValueType OpVT128, ValueType OpVT256, + OpndItins itins, bit IsCommutable = 0, Predicate prd> { +let Predicates = [HasAVX, prd] in + defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, + VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; + +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, + memopv2i64, i128mem, itins, IsCommutable, 1>; + +let Predicates = [HasAVX2, prd] in + defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, + OpVT256, VR256, loadv4i64, i256mem, itins, + IsCommutable, 0>, VEX_4V, VEX_L; +} + +// These are ordered here for pattern ordering requirements with the fp versions + +defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; +defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; +defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; +defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 0, NoVLX>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Logical Instructions +//===----------------------------------------------------------------------===// + +// Multiclass for scalars using the X86 logical operation aliases for FP. 
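+// FsAND/FsOR/FsXOR/FsANDN below are codegen-only: they reuse the packed
+// and/or/xor/andn encodings to implement X86fand & co. on FR32/FR64 scalars.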
+multiclass sse12_fp_packed_scalar_logical_alias< + bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>, + PS, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>, + PD, VEX_4V; + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, + f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS; + + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, + f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD; + } +} + +let isCodeGenOnly = 1 in { + defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; + defm FsOR : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for, + SSE_BIT_ITINS_P>; + defm FsXOR : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; + + let isCommutable = 0 in + defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn, + SSE_BIT_ITINS_P>; +} + +// Multiclass for vectors using the X86 logical operation aliases for FP. +multiclass sse12_fp_packed_vector_logical_alias< + bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { + let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, + PS, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, + PD, VEX_4V; + + defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, + PS, VEX_4V, VEX_L; + + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, + PD, VEX_4V, VEX_L; + } + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>, + PS; + + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>, + PD; + } +} + +let isCodeGenOnly = 1 in { + defm FvAND : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; + defm FvOR : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for, + SSE_BIT_ITINS_P>; + defm FvXOR : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; + + let isCommutable = 0 in + defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn, + SSE_BIT_ITINS_P>; +} + +/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops +/// +multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasAVX, NoVLX] in { + defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f256mem, + [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], + [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), + (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L; + + defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f256mem, + [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (bc_v4i64 (v4f64 VR256:$src2))))], + [(set 
VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (loadv4i64 addr:$src2)))], 0>, + PD, VEX_4V, VEX_L; + + // In AVX no need to add a pattern for 128-bit logical rr ps, because they + // are all promoted to v2i64, and the patterns are covered by the int + // version. This is needed in SSE only, because v2i64 isn't supported on + // SSE1, but only on SSE2. + defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, [], + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (loadv2i64 addr:$src2)))], 0>, + PD, VEX_4V; + } + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, + [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (memopv2i64 addr:$src2)))]>, PS; + + defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (memopv2i64 addr:$src2)))]>, PD; + } +} + +defm AND : sse12_fp_packed_logical<0x54, "and", and>; +defm OR : sse12_fp_packed_logical<0x56, "or", or>; +defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; +let isCommutable = 0 in + defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; + +// AVX1 requires type coercions in order to fold loads directly into logical +// operations. +let Predicates = [HasAVX1Only] in { + def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))), + (VANDNPSYrm VR256:$src1, addr:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Arithmetic Instructions +//===----------------------------------------------------------------------===// + +/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and +/// vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements unmodified (therefore these cannot be commuted). +/// +/// These three forms can each be reg+reg or reg+mem. 
+/// + +/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those +/// classes below +multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode, SizeItins itins> { + let Predicates = [HasAVX, NoVLX] in { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR128, v4f32, f128mem, loadv4f32, + SSEPackedSingle, itins.s, 0>, PS, VEX_4V; + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR128, v2f64, f128mem, loadv2f64, + SSEPackedDouble, itins.d, 0>, PD, VEX_4V; + + defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, VR256, v8f32, f256mem, loadv8f32, + SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L; + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, VR256, v4f64, f256mem, loadv4f64, + SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L; + } + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, + itins.s>, PS; + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, + itins.d>, PD; + } +} + +multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins> { + defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>, + XS, VEX_4V, VEX_LIG; + defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>, + XD, VEX_4V, VEX_LIG; + + let Constraints = "$src1 = $dst" in { + defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, SSEPackedSingle, + itins.s>, XS; + defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, SSEPackedDouble, + itins.d>, XD; + } +} + +multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, + SizeItins itins> { + defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG; + defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG; + + let Constraints = "$src1 = $dst" in { + defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + SSEPackedSingle, itins.s>, XS; + defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + SSEPackedDouble, itins.d>, XD; + } +} + +// Binary Arithmetic instructions +defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>; +defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>, + basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>, + basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>; +let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>; + defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>, + basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>; 
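+  // max/min stay non-commutable: the x86 max*/min* instructions return the
+  // second source operand when the inputs are unordered or both zero, so
+  // operand order is significant (MAXC/MINC below model the commutable case).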
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>; + defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>; +} + +let isCodeGenOnly = 1 in { + defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>; + defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>; +} + +// Patterns used to select SSE scalar fp arithmetic instructions from +// either: +// +// (1) a scalar fp operation followed by a blend +// +// The effect is that the backend no longer emits unnecessary vector +// insert instructions immediately after SSE scalar fp instructions +// like addss or mulss. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// A[0] += B[0]; +// return A; +// } +// +// Previously we generated: +// addss %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// We now generate: +// addss %xmm1, %xmm0 +// +// (2) a vector packed single/double fp operation followed by a vector insert +// +// The effect is that the backend converts the packed fp instruction +// followed by a vector insert into a single SSE scalar fp instruction. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// __m128 C = A + B; +// return (__m128) {c[0], a[1], a[2], a[3]}; +// } +// +// Previously we generated: +// addps %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// We now generate: +// addss %xmm1, %xmm0 + +// TODO: Some canonicalization in lowering would simplify the number of +// patterns we have to try to match. +multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [UseSSE1] in { + // extracted scalar math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + } + + // With SSE 4.1, blendi is preferred to movsd, so match that too. + let Predicates = [UseSSE41] in { + // extracted scalar math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>; + + } + + // Repeat everything for AVX, except for the movss + scalar combo... + // because that one shouldn't occur with AVX codegen? 
+ let Predicates = [HasAVX] in { + // extracted scalar math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + + // vector math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + } +} + +defm : scalar_math_f32_patterns<fadd, "ADD">; +defm : scalar_math_f32_patterns<fsub, "SUB">; +defm : scalar_math_f32_patterns<fmul, "MUL">; +defm : scalar_math_f32_patterns<fdiv, "DIV">; + +multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [UseSSE2] in { + // extracted scalar math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } + + // With SSE 4.1, blendi is preferred to movsd, so match those too. + let Predicates = [UseSSE41] in { + // extracted scalar math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } + + // Repeat everything for AVX. 
+ let Predicates = [HasAVX] in { + // extracted scalar math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // extracted scalar math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + + // vector math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } +} + +defm : scalar_math_f64_patterns<fadd, "ADD">; +defm : scalar_math_f64_patterns<fsub, "SUB">; +defm : scalar_math_f64_patterns<fmul, "MUL">; +defm : scalar_math_f64_patterns<fdiv, "DIV">; + + +/// Unop Arithmetic +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a +/// scalar) and leaves the top elements undefined. +/// +/// And, we have a special variant form for a full-vector intrinsic form. + +let Sched = WriteFSqrt in { +def SSE_SQRTPS : OpndItins< + IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM +>; + +def SSE_SQRTSS : OpndItins< + IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM +>; + +def SSE_SQRTPD : OpndItins< + IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM +>; + +def SSE_SQRTSD : OpndItins< + IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM +>; +} + +let Sched = WriteFRsqrt in { +def SSE_RSQRTPS : OpndItins< + IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM +>; + +def SSE_RSQRTSS : OpndItins< + IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM +>; +} + +let Sched = WriteFRcp in { +def SSE_RCPP : OpndItins< + IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM +>; + +def SSE_RCPS : OpndItins< + IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM +>; +} + +/// sse_fp_unop_s - SSE1 unops in scalar form +/// For the non-AVX defs, we need $src1 to be tied to $dst because +/// the HW instructions are 2 operand / destructive. 
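+/// The 3-operand, non-destructive VEX forms are handled separately by the
+/// avx_fp_unop_s multiclass that follows.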
+multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType vt, ValueType ScalarVT, + X86MemOperand x86memop, Operand vec_memop, + ComplexPattern mem_cpat, Intrinsic Intr, + SDNode OpNode, Domain d, OpndItins itins, + Predicate target, string Suffix> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>, + Requires<[target]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>, + Requires<[target, OptForSize]>; + + let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { + def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } + } + + let Predicates = [target] in { + def : Pat<(vt (OpNode mem_cpat:$src)), + (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>; + // These are unary operations, but they are modeled as having 2 source operands + // because the high elements of the destination are unchanged in SSE. + def : Pat<(Intr VR128:$src), + (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; + def : Pat<(Intr (load addr:$src)), + (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m) + addr:$src), VR128))>; + } + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // movss mem, %xmm0 + // rcpss %xmm0, %xmm0 + // which has a clobber before the rcp, vs. 
+ // rcpss mem, %xmm0 + let Predicates = [target, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + } +} + +multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType vt, ValueType ScalarVT, + X86MemOperand x86memop, Operand vec_memop, + ComplexPattern mem_cpat, + Intrinsic Intr, SDNode OpNode, Domain d, + OpndItins itins, string Suffix> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins.rr, d>, Sched<[itins.Sched]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let isCodeGenOnly = 1 in { + def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[itins.Sched.Folded]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, vec_memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } + } + + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // vmovss mem, %xmm0 + // vrcpss %xmm0, %xmm0, %xmm0 + // which has a clobber before the rcp, vs. + // vrcpss mem, %xmm0, %xmm0 + // TODO: In theory, we could fold the load, and avoid the stall caused by + // the partial register store, either in ExeDepFix or with smarter RA. + let Predicates = [UseAVX] in { + def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) + (ScalarVT (IMPLICIT_DEF)), RC:$src)>; + } + let Predicates = [HasAVX] in { + def : Pat<(Intr VR128:$src), + (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)), + VR128:$src)>; + } + let Predicates = [HasAVX, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + } + let Predicates = [UseAVX, OptForSize] in { + def : Pat<(ScalarVT (OpNode (load addr:$src))), + (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), + addr:$src)>; + def : Pat<(vt (OpNode mem_cpat:$src)), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), + mem_cpat:$src)>; + } +} + +/// sse1_fp_unop_p - SSE1 unops in packed form. 
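+/// The AVX defs are guarded by the 'prds' predicate list so callers can
+/// restrict them, e.g. to [HasAVX, NoVLX] for rsqrt/rcp.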
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, list<Predicate> prds> { +let Predicates = prds in { + def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], + itins.rr>, VEX, Sched<[itins.Sched]>; + def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))], + itins.rm>, VEX, Sched<[itins.Sched.Folded]>; + def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))], + itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; + def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))], + itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; +} + + def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>, + Sched<[itins.Sched]>; + def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +/// sse2_fp_unop_p - SSE2 unops in vector forms. +multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode, OpndItins itins> { +let Predicates = [HasAVX] in { + def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], + itins.rr>, VEX, Sched<[itins.Sched]>; + def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))], + itins.rm>, VEX, Sched<[itins.Sched.Folded]>; + def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))], + itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; + def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))], + itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; +} + + def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>, + Sched<[itins.Sched]>; + def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, + ssmem, sse_load_f32, + !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, + SSEPackedSingle, itins, UseSSE1, "SS">, XS; + defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, + f32mem, ssmem, sse_load_f32, + 
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, + SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG; +} + +multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, + sdmem, sse_load_f64, + !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), + OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD; + defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, + f64mem, sdmem, sse_load_f64, + !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), + OpNode, SSEPackedDouble, itins, "SD">, + XD, VEX_4V, VEX_LIG; +} + +// Square root. +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; + +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; + +// There is no f64 version of the reciprocal approximation instructions. + +// TODO: We should add *scalar* op patterns for these just like we have for +// the binops above. If the binop and unop patterns could all be unified +// that would be even better. + +multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix, + SDNode Move, ValueType VT, + Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), + (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } + + // With SSE 4.1, blendi is preferred to movs*, so match that too. + let Predicates = [UseSSE41] in { + def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), + (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } + + // Repeat for AVX versions of the instructions. 
+ let Predicates = [HasAVX] in { + def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), + (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + + def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), + (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } +} + +defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd, + v2f64, UseSSE2>; + + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Non-temporal stores +//===----------------------------------------------------------------------===// + +let AddedComplexity = 400 in { // Prefer non-temporal versions +let SchedRW = [WriteStore] in { +let Predicates = [HasAVX, NoVLX] in { +def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; +def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; + +let ExeDomain = SSEPackedInt in +def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2i64 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; + +def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX, VEX_L; +def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX, VEX_L; +let ExeDomain = SSEPackedInt in +def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4i64 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX, VEX_L; +} + +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; +def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; + +let ExeDomain = SSEPackedInt in +def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; + +// There is no AVX form for instructions below this point +def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movnti{l}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i32 GR32:$src), addr:$dst)], + IIC_SSE_MOVNT>, + PS, Requires<[HasSSE2]>; +def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movnti{q}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i64 GR64:$src), addr:$dst)], + IIC_SSE_MOVNT>, + PS, Requires<[HasSSE2]>; +} // SchedRW = [WriteStore] + +let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(alignednontemporalstore (v8i32 
VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; + def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; + def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; +} + +def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; +def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; +def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; + +} // AddedComplexity + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Prefetch and memory fence +//===----------------------------------------------------------------------===// + +// Prefetch intrinsic. +let Predicates = [HasSSE1], SchedRW = [WriteLoad] in { +def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), + "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), + "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), + "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), + "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))], + IIC_SSE_PREFETCH>, TB; +} + +// FIXME: How should flush instruction be modeled? +let SchedRW = [WriteLoad] in { +// Flush cache +def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflush\t$src", [(int_x86_sse2_clflush addr:$src)], + IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>; +} + +let SchedRW = [WriteNop] in { +// Pause. This "instruction" is encoded as "rep; nop", so even though it +// was introduced with SSE2, it's backward compatible. 
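+// As an informal sketch of why that works: "pause" assembles to F3 90, an
+// F3 (REP) prefix in front of the one-byte NOP (90). CPUs without SSE2
+// ignore the prefix and simply execute NOP, so a spin-wait loop such as
+//   .Lspin: pause                # rep; nop -- spin-loop hint, plain NOP on old CPUs
+//           cmpl   $0, (%rdi)    # illustrative lock word
+//           jne    .Lspin
+// still runs correctly (just without the hint) on pre-SSE2 hardware.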
+def PAUSE : I<0x90, RawFrm, (outs), (ins), + "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, + OBXS, Requires<[HasSSE2]>; +} + +let SchedRW = [WriteFence] in { +// Load, store, and memory fence +def SFENCE : I<0xAE, MRM_F8, (outs), (ins), + "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>, + PS, Requires<[HasSSE1]>; +def LFENCE : I<0xAE, MRM_E8, (outs), (ins), + "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>, + TB, Requires<[HasSSE2]>; +def MFENCE : I<0xAE, MRM_F0, (outs), (ins), + "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>, + TB, Requires<[HasSSE2]>; +} // SchedRW + +def : Pat<(X86SFence), (SFENCE)>; +def : Pat<(X86LFence), (LFENCE)>; +def : Pat<(X86MFence), (MFENCE)>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Load/Store XCSR register +//===----------------------------------------------------------------------===// + +def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>; +def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>; + +let Predicates = [UseSSE1] in { +def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>; +def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Move Aligned/Unaligned Packed Integer Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +let hasSideEffects = 0, SchedRW = [WriteMove] in { +def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, + VEX; +def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, + VEX, VEX_L; +def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, + VEX; +def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, + VEX, VEX_L; +} + +// For Disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteMove] in { +def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, + VEX; +def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; +def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, + VEX; +def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; +} + +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + hasSideEffects = 0, SchedRW = [WriteLoad] in { +def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, + VEX; +def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs 
VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, + VEX, VEX_L; +let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, + XS, VEX; + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, + XS, VEX, VEX_L; +} +} + +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, + VEX; +def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, + VEX, VEX_L; +let Predicates = [HasAVX] in { +def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, + XS, VEX; +def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, + XS, VEX, VEX_L; +} +} + +let SchedRW = [WriteMove] in { +let hasSideEffects = 0 in +def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; + +def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; + +// For Disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; + +def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; +} +} // SchedRW + +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + hasSideEffects = 0, SchedRW = [WriteLoad] in { +def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/], + IIC_SSE_MOVA_P_RM>; +def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (loadv2i64 addr:$src))*/], + IIC_SSE_MOVU_P_RM>, + XS, Requires<[UseSSE2]>; +} + +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/], + IIC_SSE_MOVA_P_MR>; +def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(store (v2i64 VR128:$src), addr:$dst)*/], + IIC_SSE_MOVU_P_MR>, + XS, Requires<[UseSSE2]>; +} + +} // ExeDomain = SSEPackedInt + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), + (VMOVDQUmr addr:$dst, VR128:$src)>; + def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src), + (VMOVDQUYmr addr:$dst, VR256:$src)>; +} +let Predicates = [UseSSE2] in +def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), + (MOVDQUmr addr:$dst, VR128:$src)>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Arithmetic Instructions +//===---------------------------------------------------------------------===// + +let 
Sched = WriteVecIMul in +def SSE_PMADD : OpndItins< + IIC_SSE_PMADD, IIC_SSE_PMADD +>; + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, + bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + Intrinsic IntId256, OpndItins itins, + bit IsCommutable = 0> { +let Predicates = [HasAVX] in + defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128, + VR128, loadv2i64, i128mem, itins, + IsCommutable, 0>, VEX_4V; + +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64, + i128mem, itins, IsCommutable, 1>; + +let Predicates = [HasAVX2] in + defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256, + VR256, loadv4i64, i256mem, itins, + IsCommutable, 0>, VEX_4V, VEX_L; +} + +multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, SDNode OpNode, + SDNode OpNode2, RegisterClass RC, + ValueType DstVT, ValueType SrcVT, PatFrag bc_frag, + PatFrag ld_frag, ShiftOpndItins itins, + bit Is2Addr = 1> { + // src2 is always 128-bit + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))], + itins.rr>, Sched<[WriteVecShift]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, + (bc_frag (ld_frag addr:$src2)))))], itins.rm>, + Sched<[WriteVecShiftLd, ReadAfterLd]>; + def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), + (ins RC:$src1, u8imm:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>, + Sched<[WriteVecShift]>; +} + +/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types +multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, 
$src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, + Sched<[itins.Sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} // ExeDomain = SSEPackedInt + +defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, + SSE_INTALU_ITINS_P, 1, NoVLX>; +defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, + SSE_INTALUQ_ITINS_P, 1, NoVLX>; +defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, + SSE_INTALU_ITINS_P, 0, NoVLX>; +defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, + SSE_INTALUQ_ITINS_P, 0, NoVLX>; +defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; + +// Intrinsic forms +defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, + int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>; +defm PSUBSW : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w, + int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>; +defm PADDSB : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b, + int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>; +defm PADDSW : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w, + int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>; +defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b, + int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>; +defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, + int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; +defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, + int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; + +let Predicates = [HasAVX] in +defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = 
[HasAVX2] in +defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, + loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; + +let Predicates = [HasAVX] in +defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = [HasAVX2] in +defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, + VR256, loadv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, + memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions +//===---------------------------------------------------------------------===// + +let Predicates = [HasAVX, NoVLX] in { +defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, + VR128, v2i64, v2i64, bc_v2i64, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, + VR128, v2i64, v2i64, bc_v2i64, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX] + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] + + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] , + Predicates = [HasAVX, NoVLX_Or_NoBWI]in { + // 128-bit logical shifts. + def VPSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>, + VEX_4V; + def VPSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, + VEX_4V; + // PSRADQri doesn't exist in SSE[1-3]. 
+} // Predicates = [HasAVX, NoVLX_Or_NoBWI] + +let Predicates = [HasAVX2, NoVLX] in { +defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, + VR256, v4i64, v2i64, bc_v2i64, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; + +defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, + VR256, v4i64, v2i64, bc_v2i64, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; + +defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX] + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX_Or_NoBWI] + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 , + Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + // 256-bit logical shifts. + def VPSLLDQYri : PDIi8<0x73, MRM7r, + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR256:$dst, + (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>, + VEX_4V, VEX_L; + def VPSRLDQYri : PDIi8<0x73, MRM3r, + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR256:$dst, + (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, + VEX_4V, VEX_L; + // PSRADQYri doesn't exist in SSE[1-3]. 
+} // Predicates = [HasAVX2, NoVLX_Or_NoBWI] + +let Constraints = "$src1 = $dst" in { +defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, + VR128, v2i64, v2i64, bc_v2i64, memopv2i64, + SSE_INTSHIFT_ITINS_P>; + +defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, + VR128, v2i64, v2i64, bc_v2i64, memopv2i64, + SSE_INTSHIFT_ITINS_P>; + +defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, + SSE_INTSHIFT_ITINS_P>; + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { + // 128-bit logical shifts. + def PSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + "pslldq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; + def PSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + "psrldq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; + // PSRADQri doesn't exist in SSE[1-3]. 
+} +} // Constraints = "$src1 = $dst" + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Comparison Instructions +//===---------------------------------------------------------------------===// + +defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, + SSE_INTALU_ITINS_P, 1, NoVLX>; +defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, + SSE_INTALU_ITINS_P, 0, NoVLX>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Shuffle Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, + SDNode OpNode> { +let Predicates = [HasAVX] in { + def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>; + def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, + Sched<[WriteShuffleLd]>; +} + +let Predicates = [HasAVX2] in { + def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>; + def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L, + Sched<[WriteShuffleLd]>; +} + +let Predicates = [UseSSE2] in { + def ri : Ii8<0x70, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>; + def mi : Ii8<0x70, MRMSrcMem, + (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} +} +} // ExeDomain = SSEPackedInt + +defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD; +defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS; +defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD; + +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))), + 
(VPSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (VPSHUFDri VR128:$src1, imm:$imm)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), + (PSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (PSHUFDri VR128:$src1, imm:$imm)>; +} + +//===---------------------------------------------------------------------===// +// Packed Integer Pack Instructions (SSE & AVX) +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (ld_frag addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : PDI<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (loadv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : SS48I<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : SS48I<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (ld_frag addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : SS48I<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : SS48I<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, 
$src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (loadv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, + bc_v8i16, loadv2i64, 0>, VEX_4V; + defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, + bc_v4i32, loadv2i64, 0>, VEX_4V; + + defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, + bc_v8i16, loadv2i64, 0>, VEX_4V; + defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, + bc_v4i32, loadv2i64, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, + bc_v8i32>, VEX_4V, VEX_L; + + defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, + bc_v8i32>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, + bc_v8i16, memopv2i64>; + defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, + bc_v4i32, memopv2i64>; + + defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, + bc_v8i16, memopv2i64>; + + let Predicates = [HasSSE41] in + defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, + bc_v4i32, memopv2i64>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Unpack Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag, + bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], + IIC_SSE_UNPCK>, Sched<[WriteShuffle]>; + def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpNode VR128:$src1, + (bc_frag (ld_frag addr:$src2))))], + IIC_SSE_UNPCK>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : PDI<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (OpNode VR256:$src1, + (bc_frag (loadv4i64 addr:$src2))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, + bc_v16i8, loadv2i64, 0>, VEX_4V; + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, + bc_v8i16, loadv2i64, 0>, VEX_4V; + defm 
VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, + bc_v16i8, loadv2i64, 0>, VEX_4V; + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, + bc_v8i16, loadv2i64, 0>, VEX_4V; +} +let Predicates = [HasAVX, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, + bc_v4i32, loadv2i64, 0>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, + bc_v2i64, loadv2i64, 0>, VEX_4V; + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, + bc_v4i32, loadv2i64, 0>, VEX_4V; + defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, + bc_v2i64, loadv2i64, 0>, VEX_4V; +} + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, + bc_v32i8>, VEX_4V, VEX_L; + defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, + bc_v16i16>, VEX_4V, VEX_L; + defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, + bc_v32i8>, VEX_4V, VEX_L; + defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, + bc_v16i16>, VEX_4V, VEX_L; +} +let Predicates = [HasAVX2, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, + bc_v4i64>, VEX_4V, VEX_L; + defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, + bc_v4i64>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, + bc_v16i8, memopv2i64>; + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, + bc_v8i16, memopv2i64>; + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, + bc_v4i32, memopv2i64>; + defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, + bc_v2i64, memopv2i64>; + + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, + bc_v16i8, memopv2i64>; + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, + bc_v8i16, memopv2i64>; + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, + bc_v4i32, memopv2i64>; + defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, + bc_v2i64, memopv2i64>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Extract and Insert +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pinsrw<bit Is2Addr = 1> { + def rri : Ii8<0xC4, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, + GR32orGR64:$src2, u8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))], + IIC_SSE_PINSRW>, Sched<[WriteShuffle]>; + def rmi : Ii8<0xC4, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, + i16mem:$src2, u8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), + imm:$src3))], IIC_SSE_PINSRW>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +// Extract +let Predicates = [HasAVX, NoBWI] in +def VPEXTRWri : Ii8<0xC5, MRMSrcReg, + (outs GR32orGR64:$dst), 
(ins VR128:$src1, u8imm:$src2), + "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))]>, PD, VEX, + Sched<[WriteShuffle]>; +def PEXTRWri : PDIi8<0xC5, MRMSrcReg, + (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))], IIC_SSE_PEXTRW>, + Sched<[WriteShuffleLd, ReadAfterLd]>; + +// Insert +let Predicates = [HasAVX, NoBWI] in +defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; + +let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in +defm PINSRW : sse2_pinsrw, PD; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Mask Creation +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { + +def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + IIC_SSE_MOVMSK>, VEX; + +let Predicates = [HasAVX2] in { +def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR256:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, + VEX, VEX_L; +} + +def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + IIC_SSE_MOVMSK>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Conditional Store +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { + +let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in +def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], + IIC_SSE_MASKMOV>, VEX; +let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in +def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], + IIC_SSE_MASKMOV>, VEX; + +let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in +def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], + IIC_SSE_MASKMOV>; +let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in +def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], + IIC_SSE_MASKMOV>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Move Doubleword/Quadword +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Packed Double Int +// +def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + VEX, Sched<[WriteMove]>; 
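+// For example, the register form above and the memory form below correspond
+// to (AT&T syntax)
+//   vmovd %eax, %xmm0        # GR32 -> low dword, upper bits of %xmm0 zeroed
+//   vmovd (%rdi), %xmm0      # same, but loading the dword from memory
+// which is roughly what the _mm_cvtsi32_si128 intrinsic lowers to.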
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, + VEX, Sched<[WriteLoad]>; +def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; +let isCodeGenOnly = 1 in +def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + +def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + Sched<[WriteMove]>; +def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +let isCodeGenOnly = 1 in +def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, Sched<[WriteMove]>; + +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Single Scalar +// +let isCodeGenOnly = 1 in { + def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + + def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, + VEX, Sched<[WriteLoad]>; + def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, Sched<[WriteMove]>; + + def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +} + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int to Packed Double Int +// +def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, + Sched<[WriteMove]>; +def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (extractelt (v4i32 VR128:$src), + 
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, + VEX, Sched<[WriteStore]>; +def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, + Sched<[WriteMove]>; +def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (extractelt (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; + +def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; + +def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; + +def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; + +def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int first element to Doubleword Int +// +let SchedRW = [WriteMove] in { +def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>, + VEX; + +def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>; +} //SchedRW + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; + +//===---------------------------------------------------------------------===// +// Bitcast FR64 <-> GR64 +// +let isCodeGenOnly = 1 in { + let Predicates = [UseAVX] in + def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, + VEX, Sched<[WriteLoad]>; + def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; + + def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; + def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; + def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + 
IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +} + +//===---------------------------------------------------------------------===// +// Move Scalar Single to Double Int +// +let isCodeGenOnly = 1 in { + def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; + def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; + def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; + def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +} + +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (VMOVDI2PDIrr GR32:$src)>; + + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIrr GR64:$src)>; + + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>; + } + // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. + // These instructions also write zeros in the high part of a 256-bit register. + let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>; + } + // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>; +} + +let Predicates = [UseSSE2] in { + let AddedComplexity = 15 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (MOVDI2PDIrr GR32:$src)>; + + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (MOV64toPQIrr GR64:$src)>; + } + let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (MOVDI2PDIrm addr:$src)>; + } +} + +// These are the correct encodings of the instructions so that we know how to +// read correct assembly, even though we continue to emit the wrong ones for +// compatibility with Darwin's buggy assembler. +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; +// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 
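+// Roughly, the effect of these aliases is that the assembler accepts both
+// spellings of the 64-bit GPR<->XMM move,
+//   movq  %rax, %xmm0       # matched by the "movq" aliases above
+//   vmovd %rax, %xmm0       # matched by the "vmovd" aliases below
+// while printing is unchanged: the trailing 0 marks each alias as
+// parse-only, so the printer keeps using the instructions' own asm strings.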
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>; + +//===---------------------------------------------------------------------===// +// SSE2 - Move Quadword +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Move Quadword Int to Packed Quadword Int +// + +let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in { +def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + VEX, Requires<[UseAVX]>; +def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))], + IIC_SSE_MOVDQ>, XS, + Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix +} // ExeDomain, SchedRW + +//===---------------------------------------------------------------------===// +// Move Packed Quadword Int to Quadword Int +// +let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { +def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>, VEX; +def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>; +} // ExeDomain, SchedRW + +// For disassembler only +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteVecLogic] in { +def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX; +def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>; +} + +//===---------------------------------------------------------------------===// +// Store / copy lower 64-bits of a XMM register. 
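The MOVQ load above zero-extends a 64-bit memory operand into an XMM register, and MOVPQI2QImr stores only the low quadword; the storel_dq patterns that follow select the same store. These map onto the SSE2 intrinsics _mm_loadl_epi64 and _mm_storel_epi64. A minimal C++ sketch for reference (editorial illustration, not part of the diff):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  int64_t src = 42;   // neither intrinsic requires 16-byte alignment
  // movq (%rdi), %xmm0: load 64 bits, zero bits [127:64].
  __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src));
  int64_t dst = -1;
  // movq %xmm0, (%rsi): store only the low 64 bits.
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), v);
  assert(dst == 42);
  return 0;
}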
+// +let Predicates = [HasAVX] in +def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), + (VMOVPQI2QImr addr:$dst, VR128:$src)>; +let Predicates = [UseSSE2] in +def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), + (MOVPQI2QImr addr:$dst, VR128:$src)>; + +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in { +def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (X86vzmovl (v2i64 (scalar_to_vector + (loadi64 addr:$src))))))], + IIC_SSE_MOVDQ>, + XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>; + +def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (X86vzmovl (v2i64 (scalar_to_vector + (loadi64 addr:$src))))))], + IIC_SSE_MOVDQ>, + XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>; +} // ExeDomain, isCodeGenOnly, AddedComplexity + +let Predicates = [UseAVX], AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), + (VMOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), + (VMOVZQI2PQIrm addr:$src)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>; +} + +let Predicates = [UseSSE2], AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), + (MOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4i64 (alignedX86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>; +def : Pat<(v4i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>; +} + +//===---------------------------------------------------------------------===// +// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in +// IA32 document. movq xmm1, xmm2 does clear the high bits. 
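As the comment says, register-to-register movq copies the low quadword and clears bits [127:64] of the destination, despite the wording in the manual it references; that zeroing behaviour is what the X86vzmovl defs below select. The SSE2 intrinsic _mm_move_epi64 exposes it directly; a short illustrative C++ example (not part of the patch):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  // Lanes are {low = 7, high = -1}.
  __m128i v = _mm_set_epi64x(/*high*/ -1, /*low*/ 7);
  // movq %xmm0, %xmm1: keep the low quadword, zero the high one.
  __m128i z = _mm_move_epi64(v);
  alignas(16) int64_t lanes[2];
  _mm_store_si128(reinterpret_cast<__m128i *>(lanes), z);
  assert(lanes[0] == 7 && lanes[1] == 0);
  return 0;
}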
+// +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { +let AddedComplexity = 15 in +def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], + IIC_SSE_MOVQ_RR>, + XS, VEX, Requires<[UseAVX]>; +let AddedComplexity = 15 in +def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], + IIC_SSE_MOVQ_RR>, + XS, Requires<[UseSSE2]>; +} // ExeDomain, SchedRW + +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { +let AddedComplexity = 20 in +def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, + XS, VEX, Requires<[UseAVX]>; +let AddedComplexity = 20 in { +def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, + XS, Requires<[UseSSE2]>; +} +} // ExeDomain, isCodeGenOnly, SchedRW + +let AddedComplexity = 20 in { + let Predicates = [UseAVX] in { + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (VMOVZPQILo2PQIrr VR128:$src)>; + } + let Predicates = [UseSSE2] in { + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (MOVZPQILo2PQIrr VR128:$src)>; + } +} + +//===---------------------------------------------------------------------===// +// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, + ValueType vt, RegisterClass RC, PatFrag mem_frag, + X86MemOperand x86memop> { +def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (vt (OpNode RC:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; +def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (OpNode (mem_frag addr:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v4f32, VR128, loadv4f32, f128mem>, VEX; + defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", + v4f32, VR128, loadv4f32, f128mem>, VEX; + defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; + defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", + v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; +} +defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, + memopv4f32, f128mem>; +defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, + memopv4f32, f128mem>; + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (VMOVSHDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVSHDUPrm addr:$src)>; + def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (VMOVSLDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVSLDUPrm addr:$src)>; + def : Pat<(v8i32 (X86Movshdup VR256:$src)), + (VMOVSHDUPYrr VR256:$src)>; + def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 
addr:$src)))), + (VMOVSHDUPYrm addr:$src)>; + def : Pat<(v8i32 (X86Movsldup VR256:$src)), + (VMOVSLDUPYrr VR256:$src)>; + def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))), + (VMOVSLDUPYrm addr:$src)>; +} + +let Predicates = [UseSSE3] in { + def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (MOVSHDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSHDUPrm addr:$src)>; + def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (MOVSLDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSLDUPrm addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Replicate Double FP - MOVDDUP +//===---------------------------------------------------------------------===// + +multiclass sse3_replicate_dfp<string OpcodeStr> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (v2f64 (X86Movddup + (scalar_to_vector (loadf64 addr:$src)))))], + IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; +} + +// FIXME: Merge with above classe when there're patterns for the ymm version +multiclass sse3_replicate_dfp_y<string OpcodeStr> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, + Sched<[WriteFShuffle]>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, + Sched<[WriteLoad]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; + defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L; +} + +defm MOVDDUP : sse3_replicate_dfp<"movddup">; + + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(X86Movddup (loadv2f64 addr:$src)), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + + // 256-bit version + def : Pat<(X86Movddup (loadv4i64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; +} + +let Predicates = [UseAVX, OptForSize] in { + def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +} + +let Predicates = [UseSSE3] in { + def : Pat<(X86Movddup (memopv2f64 addr:$src)), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (MOVDDUPrm addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Move Unaligned Integer 
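For orientation, the SSE3 replicate instructions defined above duplicate alternating single-precision lanes (movshdup, movsldup) or the low double (movddup). Their intrinsics make the lane mapping explicit; the snippet below is an editorial illustration only (assumes SSE3 is enabled, not part of this patch).

#include <immintrin.h>
#include <cassert>

int main() {
  __m128 v = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
  float hi[4], lo[4];
  _mm_storeu_ps(hi, _mm_movehdup_ps(v));   // movshdup: {1, 1, 3, 3}
  _mm_storeu_ps(lo, _mm_moveldup_ps(v));   // movsldup: {0, 0, 2, 2}
  assert(hi[0] == 1.f && hi[1] == 1.f && hi[2] == 3.f && hi[3] == 3.f);
  assert(lo[0] == 0.f && lo[1] == 0.f && lo[2] == 2.f && lo[3] == 2.f);

  double d[2];
  _mm_storeu_pd(d, _mm_movedup_pd(_mm_setr_pd(5.0, 6.0)));  // movddup: {5, 5}
  assert(d[0] == 5.0 && d[1] == 5.0);
  return 0;
}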
+//===---------------------------------------------------------------------===// + +let SchedRW = [WriteLoad] in { +let Predicates = [HasAVX] in { + def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; + def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, + VEX, VEX_L; +} +def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "lddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))], + IIC_SSE_LDDQU>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Arithmetic +//===---------------------------------------------------------------------===// + +multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, OpndItins itins, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : I<0xD0, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm : I<0xD0, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, + f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V; + defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, + f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, + f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V; + defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, + f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L; + } +} +let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { + let ExeDomain = SSEPackedSingle in + defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, + f128mem, SSE_ALU_F32P, memopv4f32>, XD; + let ExeDomain = SSEPackedDouble in + defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, + f128mem, SSE_ALU_F64P, memopv2f64>, PD; +} + +// Patterns used to select 'addsub' instructions. 
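The patterns below map the X86Addsub node onto these instructions. addsubps/addsubpd subtract in the even lanes and add in the odd lanes, which is what makes them useful for interleaved complex arithmetic. A small illustrative C++ example of the SSE3 intrinsic (editor's sketch, not part of the diff):

#include <immintrin.h>
#include <cassert>

int main() {
  __m128 a = _mm_setr_ps(10.f, 10.f, 10.f, 10.f);
  __m128 b = _mm_setr_ps( 1.f,  2.f,  3.f,  4.f);
  // addsubps: lane0 = a0 - b0, lane1 = a1 + b1, lane2 = a2 - b2, lane3 = a3 + b3.
  float r[4];
  _mm_storeu_ps(r, _mm_addsub_ps(a, b));
  assert(r[0] == 9.f && r[1] == 12.f && r[2] == 7.f && r[3] == 14.f);
  return 0;
}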
+let Predicates = [HasAVX] in { + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), + (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))), + (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), + (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))), + (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; + + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))), + (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))), + (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; +} + +let Predicates = [UseSSE3] in { + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), + (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))), + (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))), + (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 Instructions +//===---------------------------------------------------------------------===// + +// Horizontal ops +multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, + bit Is2Addr = 1> { + def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, + Sched<[WriteFAdd]>; + + def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], + IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; +} +multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, + bit Is2Addr = 1> { + def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, + Sched<[WriteFAdd]>; + + def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], + IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, + X86fhadd, loadv4f32, 0>, VEX_4V; + defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, + X86fhsub, 
loadv4f32, 0>, VEX_4V; + defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, + X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L; + defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, + X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, + X86fhadd, loadv2f64, 0>, VEX_4V; + defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, + X86fhsub, loadv2f64, 0>, VEX_4V; + defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, + X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L; + defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, + X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L; + } +} + +let Constraints = "$src1 = $dst" in { + let ExeDomain = SSEPackedSingle in { + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, + memopv4f32>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, + memopv4f32>; + } + let ExeDomain = SSEPackedDouble in { + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, + memopv2f64>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, + memopv2f64>; + } +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Absolute Instructions +//===---------------------------------------------------------------------===// + + +/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. +multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + PatFrag ld_frag> { + def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>, + Sched<[WriteVecALU]>; + + def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId128 + (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>, + Sched<[WriteVecALULd]>; +} + +/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. +multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256> { + def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId256 VR256:$src))]>, + Sched<[WriteVecALU]>; + + def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (IntId256 + (bitconvert (loadv4i64 addr:$src))))]>, + Sched<[WriteVecALULd]>; +} + +// Helper fragments to match sext vXi1 to vXiY. 
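The sign-extension fragments below feed the pabs patterns further down: they recognize the classic branch-free absolute-value idiom (for 32-bit lanes, abs(x) = (x + m) ^ m with m = x >> 31, arithmetic shift) and replace it with a single pabsb/pabsw/pabsd. A scalar and vector illustration in C++ (editorial sketch, assumes SSSE3; not part of the patch):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

// The idiom matched by the xor/add patterns: m is all-ones for negative x, zero otherwise.
static int32_t abs_idiom(int32_t x) {
  int32_t m = x >> 31;          // arithmetic shift, the "sext vXi1" fragment
  return (x + m) ^ m;           // add then xor, the DAG shape being matched
}

int main() {
  assert(abs_idiom(-5) == 5 && abs_idiom(7) == 7);

  // The vector form of the same idiom, which the patterns rewrite into one pabsd.
  __m128i x = _mm_setr_epi32(-5, 7, -9, 0);
  __m128i m = _mm_srai_epi32(x, 31);
  __m128i idiom = _mm_xor_si128(_mm_add_epi32(x, m), m);
  __m128i pabs  = _mm_abs_epi32(x);
  assert(_mm_movemask_epi8(_mm_cmpeq_epi32(idiom, pabs)) == 0xFFFF);
  return 0;
}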
+def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)), + VR128:$src))>; +def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>; +def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>; +def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)), + VR256:$src))>; +def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>; +def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>; + +let Predicates = [HasAVX] in { + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128, + loadv2i64>, VEX; + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128, + loadv2i64>, VEX; + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128, + loadv2i64>, VEX; + + def : Pat<(xor + (bc_v2i64 (v16i1sextv16i8)), + (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), + (VPABSBrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v8i1sextv8i16)), + (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), + (VPABSWrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v4i1sextv4i32)), + (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), + (VPABSDrr128 VR128:$src)>; +} + +let Predicates = [HasAVX2] in { + defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", + int_x86_avx2_pabs_b>, VEX, VEX_L; + defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", + int_x86_avx2_pabs_w>, VEX, VEX_L; + defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", + int_x86_avx2_pabs_d>, VEX, VEX_L; + + def : Pat<(xor + (bc_v4i64 (v32i1sextv32i8)), + (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))), + (VPABSBrr256 VR256:$src)>; + def : Pat<(xor + (bc_v4i64 (v16i1sextv16i16)), + (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))), + (VPABSWrr256 VR256:$src)>; + def : Pat<(xor + (bc_v4i64 (v8i1sextv8i32)), + (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))), + (VPABSDrr256 VR256:$src)>; +} + +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128, + memopv2i64>; +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128, + memopv2i64>; +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128, + memopv2i64>; + +let Predicates = [HasSSSE3] in { + def : Pat<(xor + (bc_v2i64 (v16i1sextv16i8)), + (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), + (PABSBrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v8i1sextv8i16)), + (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), + (PABSWrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v4i1sextv4i32)), + (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), + (PABSDrr128 VR128:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Binary Operator Instructions +//===---------------------------------------------------------------------===// + +let Sched = WriteVecALU in { +def SSE_PHADDSUBD : OpndItins< + IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM +>; +def SSE_PHADDSUBSW : OpndItins< + IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM +>; +def SSE_PHADDSUBW : OpndItins< + IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM +>; +} +let Sched = WriteShuffle in +def SSE_PSHUFB : OpndItins< + IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM +>; +let Sched = WriteVecALU in +def SSE_PSIGN : OpndItins< + IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM +>; +let Sched = WriteVecIMul in +def SSE_PMULHRSW : OpndItins< + IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW +>; + +/// SS3I_binop_rm - Simple SSSE3 bin op +multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, 
PatFrag memop_frag, + X86MemOperand x86memop, OpndItins itins, + bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, + Sched<[itins.Sched]>; + def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. +multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, OpndItins itins, + PatFrag ld_frag, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + Sched<[itins.Sched]>; + def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (ld_frag addr:$src2))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256, + X86FoldableSchedWrite Sched> { + let isCommutable = 1 in + def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, + Sched<[Sched]>; + def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, + Sched<[Sched.Folded, ReadAfterLd]>; +} + +let ImmT = NoImm, Predicates = [HasAVX] in { +let isCommutable = 0 in { + defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128, + loadv2i64, i128mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128, + loadv2i64, i128mem, + SSE_PHADDSUBD, 0>, VEX_4V; + defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128, + loadv2i64, i128mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128, + loadv2i64, i128mem, + SSE_PHADDSUBD, 0>, VEX_4V; + defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128, + loadv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128, + loadv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128, + loadv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128, + loadv2i64, i128mem, + SSE_PSHUFB, 0>, VEX_4V; + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", + int_x86_ssse3_phadd_sw_128, + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; + 
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", + int_x86_ssse3_phsub_sw_128, + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; + defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", + int_x86_ssse3_pmadd_ub_sw_128, + SSE_PMADD, loadv2i64, 0>, VEX_4V; +} +defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", + int_x86_ssse3_pmul_hr_sw_128, + SSE_PMULHRSW, loadv2i64, 0>, VEX_4V; +} + +let ImmT = NoImm, Predicates = [HasAVX2] in { +let isCommutable = 0 in { + defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256, + loadv4i64, i256mem, + SSE_PSHUFB, 0>, VEX_4V, VEX_L; + defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", + int_x86_avx2_phadd_sw, + WriteVecALU>, VEX_4V, VEX_L; + defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", + int_x86_avx2_phsub_sw, + WriteVecALU>, VEX_4V, VEX_L; + defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", + int_x86_avx2_pmadd_ub_sw, + WriteVecIMul>, VEX_4V, VEX_L; +} +defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", + int_x86_avx2_pmul_hr_sw, + WriteVecIMul>, VEX_4V, VEX_L; +} + +// None of these have i8 immediate fields. 
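The legacy (non-VEX) encodings of the same SSSE3 operations follow. As a reminder of what two of the less obvious ones compute, phaddw sums adjacent word pairs and pshufb is a byte-granular table lookup whose mask high bit writes zero; a brief illustrative C++ snippet (SSSE3 assumed, not part of this patch):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  // phaddw: out = {a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, ...}.
  __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  alignas(16) int16_t hw[8];
  _mm_store_si128(reinterpret_cast<__m128i *>(hw), _mm_hadd_epi16(a, a));
  assert(hw[0] == 3 && hw[1] == 7 && hw[2] == 11 && hw[3] == 15);

  // pshufb: each mask byte selects a source byte; a set high bit yields zero instead.
  __m128i bytes = _mm_setr_epi8(10, 11, 12, 13, 14, 15, 16, 17,
                                18, 19, 20, 21, 22, 23, 24, 25);
  __m128i mask  = _mm_setr_epi8(3, 3, 0, 0, -128, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0);
  alignas(16) int8_t out[16];
  _mm_store_si128(reinterpret_cast<__m128i *>(out), _mm_shuffle_epi8(bytes, mask));
  assert(out[0] == 13 && out[1] == 13 && out[2] == 10 && out[4] == 0);
  return 0;
}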
+let ImmT = NoImm, Constraints = "$src1 = $dst" in { +let isCommutable = 0 in { + defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128, + memopv2i64, i128mem, SSE_PHADDSUBW>; + defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128, + memopv2i64, i128mem, SSE_PHADDSUBD>; + defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128, + memopv2i64, i128mem, SSE_PHADDSUBW>; + defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128, + memopv2i64, i128mem, SSE_PHADDSUBD>; + defm PSIGNB : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128, + memopv2i64, i128mem, SSE_PSIGN>; + defm PSIGNW : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128, + memopv2i64, i128mem, SSE_PSIGN>; + defm PSIGND : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128, + memopv2i64, i128mem, SSE_PSIGN>; + defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128, + memopv2i64, i128mem, SSE_PSHUFB>; + defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", + int_x86_ssse3_phadd_sw_128, + SSE_PHADDSUBSW, memopv2i64>; + defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", + int_x86_ssse3_phsub_sw_128, + SSE_PHADDSUBSW, memopv2i64>; + defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", + int_x86_ssse3_pmadd_ub_sw_128, + SSE_PMADD, memopv2i64>; +} +defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", + int_x86_ssse3_pmul_hr_sw_128, + SSE_PMULHRSW, memopv2i64>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Align Instruction Patterns +//===---------------------------------------------------------------------===// + +multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { + let hasSideEffects = 0 in { + def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>; + let mayLoad = 1 in + def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>; + } +} + +multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { + let hasSideEffects = 0 in { + def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[WriteShuffle]>; + let mayLoad = 1 in + def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2, u8imm:$src3), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[WriteShuffleLd, ReadAfterLd]>; + } +} + +let Predicates = [HasAVX] in + defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V; +let Predicates = [HasAVX2] in + defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in + defm PALIGN : ssse3_palignr<"palignr">; + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86PAlignr 
VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; +} + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +} + +let Predicates = [UseSSSE3] in { +def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Thread synchronization +//===---------------------------------------------------------------------===// + +let SchedRW = [WriteSystem] in { +let usesCustomInserter = 1 in { +def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), + [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>, + Requires<[HasSSE3]>; +} + +let Uses = [EAX, ECX, EDX] in +def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>, + TB, Requires<[HasSSE3]>; +let Uses = [ECX, EAX] in +def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", + [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>, + TB, Requires<[HasSSE3]>; +} // SchedRW + +def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>; +def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; + +def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>, + Requires<[Not64BitMode]>; +def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, + Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Packed Move with Sign/Zero Extend +//===----------------------------------------------------------------------===// + +multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, + RegisterClass OutRC, RegisterClass InRC, + OpndItins itins> { + def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [], itins.rr>, + Sched<[itins.Sched]>; + + def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [], + itins.rm>, Sched<[itins.Sched.Folded]>; +} + +multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp, + OpndItins SSEItins, OpndItins AVXItins, + OpndItins AVX2Itins> { + defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>; + let Predicates = [HasAVX, NoVLX] in + defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, + VR128, 
VR128, AVXItins>, VEX; + let Predicates = [HasAVX2, NoVLX] in + defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, + VR256, VR128, AVX2Itins>, VEX, VEX_L; +} + +multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp> { + defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), + MemOp, MemYOp, + SSE_INTALU_ITINS_SHUFF_P, + DEFAULT_ITINS_SHUFFLESCHED, + DEFAULT_ITINS_SHUFFLESCHED>; + defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10), + !strconcat("pmovzx", OpcodeStr), + MemOp, MemYOp, + SSE_INTALU_ITINS_SHUFF_P, + DEFAULT_ITINS_SHUFFLESCHED, + DEFAULT_ITINS_SHUFFLESCHED>; +} + +defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>; +defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>; +defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>; + +defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>; +defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>; + +defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>; + +// AVX2 Patterns +multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> { + // Register-Register patterns + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; + + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; + + // On AVX2, we also support 256bit inputs. + def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))), + (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))), + (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))), + (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + // Simple Register-Memory patterns + def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + + // AVX2 Register-Memory patterns + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (bc_v16i8 
(loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; + defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; +} + +// SSE4.1/AVX patterns. 
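These extension patterns fold the various ways a 64-bit (or smaller) chunk can be loaded into the pmovsx/pmovzx instruction itself. At the intrinsic level the distinction is simply _mm_cvtepi8_epi16 (sign extend) versus _mm_cvtepu8_epi16 (zero extend); a hedged C++ illustration (SSE4.1 assumed, not part of the diff):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  // Eight bytes in memory, loaded with a 64-bit movq-style load...
  const int8_t bytes[8] = {-1, 2, -3, 4, -5, 6, -7, 8};
  __m128i lo = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(bytes));

  alignas(16) int16_t s[8], z[8];
  // ...then widened to eight 16-bit lanes: pmovsxbw sign-extends, pmovzxbw zero-extends.
  _mm_store_si128(reinterpret_cast<__m128i *>(s), _mm_cvtepi8_epi16(lo));
  _mm_store_si128(reinterpret_cast<__m128i *>(z), _mm_cvtepu8_epi16(lo));
  assert(s[0] == -1 && z[0] == 255);
  assert(s[6] == -7 && z[6] == 249);
  return 0;
}

With optimization enabled the separate load typically folds into the instruction's memory operand, which is the folding these register-memory patterns describe.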
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, + SDNode ExtOp, PatFrag ExtLoad16> { + def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; + + def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; + + def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; + + def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) 
addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>; +} + +let Predicates = [UseSSE41] in { + defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Extract Instructions +//===----------------------------------------------------------------------===// + +/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem +multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), + imm:$src2))]>, + Sched<[WriteShuffle]>; + let hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteShuffleLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i8mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1), + imm:$src2)))), addr:$dst)]>; +} + +let Predicates = [HasAVX, NoBWI] in + defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; + +defm PEXTRB : SS41I_extract8<0x14, "pextrb">; + + +/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination +multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[WriteShuffle]>; + + let hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteShuffleLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i16mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1), + imm:$src2)))), addr:$dst)]>; +} + +let Predicates = [HasAVX, NoBWI] in + defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; + +defm PEXTRW : SS41I_extract16<0x15, "pextrw">; + + +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32:$dst, + (extractelt (v4i32 VR128:$src1), imm:$src2))]>, + Sched<[WriteShuffle]>; + let SchedRW 
= [WriteShuffleLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (v4i32 VR128:$src1), imm:$src2), + addr:$dst)]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; + +defm PEXTRD : SS41I_extract32<0x16, "pextrd">; + +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR64:$dst, + (extractelt (v2i64 VR128:$src1), imm:$src2))]>, + Sched<[WriteShuffle]>, REX_W; + let SchedRW = [WriteShuffleLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (v2i64 VR128:$src1), imm:$src2), + addr:$dst)]>, REX_W; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; + +defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; + +/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory +/// destination +multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, + OpndItins itins = DEFAULT_ITINS> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32orGR64:$dst, + (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))], + itins.rr>, Sched<[WriteFBlend]>; + let SchedRW = [WriteFBlendLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins f32mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), + addr:$dst)], itins.rm>; +} + +let ExeDomain = SSEPackedSingle in { + let Predicates = [UseAVX] in + defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; + defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>; +} + +// Also match an EXTRACTPS store when the store is done as f32 instead of i32. 
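extractps writes the raw 32-bit pattern of the selected float lane to a GPR or to memory, so the resulting store is the same whether the program types the value as i32 or f32; that is what the two patterns below capture. A small C++ illustration using the SSE4.1 intrinsic (editor's example, not part of this patch):

#include <immintrin.h>
#include <cassert>
#include <cstring>

int main() {
  __m128 v = _mm_setr_ps(1.5f, 2.5f, 3.5f, 4.5f);
  // extractps $2, %xmm0, %eax: returns the bit pattern of lane 2 as an int.
  int bits = _mm_extract_ps(v, 2);
  float f;
  std::memcpy(&f, &bits, sizeof f);   // the same 32 bits reinterpreted as a float
  assert(f == 3.5f);
  return 0;
}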
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[HasAVX]>; +def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[UseSSE41]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Insert Instructions +//===----------------------------------------------------------------------===// + +multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, + Sched<[WriteShuffle]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i8mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), + imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoBWI] in + defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRB : SS41I_insert8<0x20, "pinsrb">; + +multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, + Sched<[WriteShuffle]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i32mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), + imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRD : SS41I_insert32<0x22, "pinsrd">; + +multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR64:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, + Sched<[WriteShuffle]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i64mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), + imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; +let 
Constraints = "$src1 = $dst" in + defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; + +// insertps has a few different modes, there's the first two here below which +// are optimized inserts that won't zero arbitrary elements in the destination +// vector. The next one matches the intrinsic and could zero arbitrary elements +// in the target vector. +multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, + Sched<[WriteFShuffle]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f32mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insertps VR128:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))], itins.rm>, + Sched<[WriteFShuffleLd, ReadAfterLd]>; +} + +let ExeDomain = SSEPackedSingle in { + let Predicates = [UseAVX] in + defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; + let Constraints = "$src1 = $dst" in + defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>; +} + +let Predicates = [UseSSE41] in { + // If we're inserting an element from a load or a null pshuf of a load, + // fold the load into the insertps instruction. + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32 + (scalar_to_vector (loadf32 addr:$src2))), (i8 0)), + imm:$src3)), + (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd + (loadv4f32 addr:$src2), (i8 0)), imm:$src3)), + (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; +} + +let Predicates = [UseAVX] in { + // If we're inserting an element from a vbroadcast of a load, fold the + // load into the X86insertps instruction. + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Round Instructions +//===----------------------------------------------------------------------===// + +multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + PatFrag mem_frag32, PatFrag mem_frag64, + Intrinsic V4F32Int, Intrinsic V2F64Int> { +let ExeDomain = SSEPackedSingle in { + // Intrinsic operation, reg. 
+ // Vector intrinsic operation, reg + def PSr : SS4AIi8<opcps, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))], + IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; + + // Vector intrinsic operation, mem + def PSm : SS4AIi8<opcps, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (V4F32Int (mem_frag32 addr:$src1),imm:$src2))], + IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>; +} // ExeDomain = SSEPackedSingle + +let ExeDomain = SSEPackedDouble in { + // Vector intrinsic operation, reg + def PDr : SS4AIi8<opcpd, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))], + IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; + + // Vector intrinsic operation, mem + def PDm : SS4AIi8<opcpd, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (V2F64Int (mem_frag64 addr:$src1),imm:$src2))], + IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>; +} // ExeDomain = SSEPackedDouble +} + +multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, + Intrinsic F32Int, + Intrinsic F64Int, bit Is2Addr = 1> { +let ExeDomain = GenericDomain in { + // Operation, reg. + let hasSideEffects = 0 in + def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, Sched<[WriteFAdd]>; + + // Intrinsic operation, reg. + let isCodeGenOnly = 1 in + def SSr_Int : SS4AIi8<opcss, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, + Sched<[WriteFAdd]>; + + // Intrinsic operation, mem. + def SSm : SS4AIi8<opcss, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, + Sched<[WriteFAddLd, ReadAfterLd]>; + + // Operation, reg. + let hasSideEffects = 0 in + def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, Sched<[WriteFAdd]>; + + // Intrinsic operation, reg. 
+ let isCodeGenOnly = 1 in + def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, + Sched<[WriteFAdd]>; + + // Intrinsic operation, mem. + def SDm : SS4AIi8<opcsd, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, + Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = GenericDomain +} + +// FP round - roundss, roundps, roundsd, roundpd +let Predicates = [HasAVX] in { + // Intrinsic form + defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, + loadv4f32, loadv2f64, + int_x86_sse41_round_ps, + int_x86_sse41_round_pd>, VEX; + defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, + loadv8f32, loadv4f64, + int_x86_avx_round_ps_256, + int_x86_avx_round_pd_256>, VEX, VEX_L; + defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", + int_x86_sse41_round_ss, + int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; +} + +let Predicates = [UseAVX] in { + def : Pat<(ffloor FR32:$src), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; + def : Pat<(f64 (ffloor FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; + def : Pat<(f64 (fceil FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; + def : Pat<(f32 (frint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (ffloor VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x9))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xA))>; + def : Pat<(v4f32 (frint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xB))>; + + def : Pat<(v2f64 (ffloor VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x9))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xA))>; + def : Pat<(v2f64 (frint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xB))>; + + def : Pat<(v8f32 (ffloor VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x9))>; + def : Pat<(v8f32 (fnearbyint VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0xC))>; + def : Pat<(v8f32 (fceil VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0xA))>; + def : Pat<(v8f32 (frint VR256:$src)), + 
(VROUNDYPSr VR256:$src, (i32 0x4))>; + def : Pat<(v8f32 (ftrunc VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0xB))>; + + def : Pat<(v4f64 (ffloor VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x9))>; + def : Pat<(v4f64 (fnearbyint VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0xC))>; + def : Pat<(v4f64 (fceil VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0xA))>; + def : Pat<(v4f64 (frint VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x4))>; + def : Pat<(v4f64 (ftrunc VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0xB))>; +} + +defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, + memopv4f32, memopv2f64, + int_x86_sse41_round_ps, int_x86_sse41_round_pd>; +let Constraints = "$src1 = $dst" in +defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", + int_x86_sse41_round_ss, int_x86_sse41_round_sd>; + +let Predicates = [UseSSE41] in { + def : Pat<(ffloor FR32:$src), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; + def : Pat<(f64 (ffloor FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; + def : Pat<(f64 (fceil FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; + def : Pat<(f32 (frint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; + + def : Pat<(v4f32 (ffloor VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x9))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xA))>; + def : Pat<(v4f32 (frint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xB))>; + + def : Pat<(v2f64 (ffloor VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x9))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xA))>; + def : Pat<(v2f64 (frint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xB))>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Packed Bit Test +//===----------------------------------------------------------------------===// + +// ptest instruction we'll lower to this in X86ISelLowering primarily from +// the intel intrinsic that corresponds to this. 
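Illustrative aside, not part of this patch: the "intel intrinsic" the comment above refers to is presumably the SSE4.1 test family (_mm_testz_si128 / _mm_testc_si128, from smmintrin.h), which report whether A AND B, respectively A AND NOT B, is all zeros. A minimal C sketch of source that typically gets selected to a single ptest plus a flag branch; the helper name is made up for the example:

#include <smmintrin.h>   /* SSE4.1 */

/* Returns non-zero iff any bit of v is set.
 * _mm_testz_si128(a, b) yields ZF, i.e. 1 when (a & b) == 0,
 * so testing v against itself asks "is v all zeros?". */
static int any_bit_set(__m128i v) {
    return !_mm_testz_si128(v, v);
}
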
+let Defs = [EFLAGS], Predicates = [HasAVX] in { +def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, + Sched<[WriteVecLogic]>, VEX; +def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX; + +def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, + Sched<[WriteVecLogic]>, VEX, VEX_L; +def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L; +} + +let Defs = [EFLAGS] in { +def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "ptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, + Sched<[WriteVecLogic]>; +def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "ptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, + Sched<[WriteVecLogicLd, ReadAfterLd]>; +} + +// The bit test instructions below are AVX only +multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> { + def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, + Sched<[WriteVecLogic]>, VEX; + def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX; +} + +let Defs = [EFLAGS], Predicates = [HasAVX] in { +let ExeDomain = SSEPackedSingle in { +defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>; +defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>, + VEX_L; +} +let ExeDomain = SSEPackedDouble in { +defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>; +defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>, + VEX_L; +} +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Misc Instructions +//===----------------------------------------------------------------------===// + +let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { + def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)], + IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, + OpSize16, XS; + def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop (loadi16 addr:$src))), + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, + Sched<[WriteFAddLd]>, OpSize16, XS; + + def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)], + IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, + OpSize32, XS; + + def POPCNT32rm : I<0xB8, MRMSrcMem, (outs 
GR32:$dst), (ins i32mem:$src), + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop (loadi32 addr:$src))), + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, + Sched<[WriteFAddLd]>, OpSize32, XS; + + def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)], + IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS; + def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop (loadi64 addr:$src))), + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, + Sched<[WriteFAddLd]>, XS; +} + + + +// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. +multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, PatFrag ld_frag, + X86FoldableSchedWrite Sched> { + def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId128 VR128:$src))]>, + Sched<[Sched]>; + def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId128 (bitconvert (ld_frag addr:$src))))]>, + Sched<[Sched.Folded]>; +} + +// PHMIN has the same profile as PSAD, thus we use the same scheduling +// model, although the naming is misleading. +let Predicates = [HasAVX] in +defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", + int_x86_sse41_phminposuw, loadv2i64, + WriteVecIMul>, VEX; +defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", + int_x86_sse41_phminposuw, memopv2i64, + WriteVecIMul>; + +/// SS48I_binop_rm - Simple SSE41 binary operator. +multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = SSE_INTALU_ITINS_P> { + let isCommutable = 1 in + def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, + Sched<[itins.Sched]>; + def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst +/// types. 
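Illustrative aside, not part of this patch: the only user of the "different src and dst types" form below is pmuldq, whose inputs are v4i32 but whose result is v2i64. A minimal C sketch of that widening multiply, assuming the standard SSE4.1 intrinsic _mm_mul_epi32 (smmintrin.h); the helper name is made up for the example:

#include <smmintrin.h>   /* SSE4.1 */

/* pmuldq: take the low (even-indexed) signed 32-bit element of each
 * 64-bit lane of a and b, multiply them, and produce two signed
 * 64-bit products, i.e. a v4i32 x v4i32 -> v2i64 operation. */
static __m128i widening_mul(__m128i a, __m128i b) {
    return _mm_mul_epi32(a, b);
}
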
+multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, + Sched<[itins.Sched]>; + def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32, + VR128, loadv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32, + VR256, loadv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm 
PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32, + VR128, memopv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, + memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>, + VEX_4V; + defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, + memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; +} +let Predicates = [HasAVX2] in { + defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>, + VEX_4V, VEX_L; + defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>; + defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, + memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>; +} + +/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, + Sched<[itins.Sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (IntId RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))], + itins.rr>, 
Sched<[itins.Sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let isCommutable = 0 in { + defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V; + } + + let ExeDomain = SSEPackedSingle in { + defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32, + VR128, loadv4f32, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32, + VR256, loadv8f32, f256mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64, + VR128, loadv2f64, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64, + VR256, loadv4f64, f256mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; + } + defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V; + + let ExeDomain = SSEPackedSingle in + defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, + VR128, loadv4f32, f128mem, 0, + SSE_DPPS_ITINS>, VEX_4V; + let ExeDomain = SSEPackedDouble in + defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, + VR128, loadv2f64, f128mem, 0, + SSE_DPPS_ITINS>, VEX_4V; + let ExeDomain = SSEPackedSingle in + defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, + VR256, loadv8f32, i256mem, 0, + SSE_DPPS_ITINS>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX2] in { + let isCommutable = 0 in { + defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L; + } + defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16, + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + let isCommutable = 0 in { + defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv2i64, i128mem, + 1, SSE_MPSADBW_ITINS>; + } + let ExeDomain = SSEPackedSingle in + defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32, + VR128, memopv4f32, f128mem, + 1, SSE_INTALU_ITINS_FBLEND_P>; + let ExeDomain = SSEPackedDouble in + defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64, + VR128, memopv2f64, f128mem, + 1, SSE_INTALU_ITINS_FBLEND_P>; + defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16, + VR128, memopv2i64, i128mem, + 1, SSE_INTALU_ITINS_BLEND_P>; + let ExeDomain = SSEPackedSingle in + defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, + VR128, memopv4f32, f128mem, 1, + SSE_DPPS_ITINS>; + let ExeDomain = SSEPackedDouble in + defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, + VR128, memopv2f64, f128mem, 1, + SSE_DPPD_ITINS>; +} + +/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators +multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_frag, Intrinsic IntId, 
+ X86FoldableSchedWrite Sched> { + def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], + NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, + Sched<[Sched]>; + + def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), + RC:$src3))], + NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, + Sched<[Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { +let ExeDomain = SSEPackedDouble in { +defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, + loadv2f64, int_x86_sse41_blendvpd, + WriteFVarBlend>; +defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, + loadv4f64, int_x86_avx_blendv_pd_256, + WriteFVarBlend>, VEX_L; +} // ExeDomain = SSEPackedDouble +let ExeDomain = SSEPackedSingle in { +defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, + loadv4f32, int_x86_sse41_blendvps, + WriteFVarBlend>; +defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, + loadv8f32, int_x86_avx_blendv_ps_256, + WriteFVarBlend>, VEX_L; +} // ExeDomain = SSEPackedSingle +defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, + loadv2i64, int_x86_sse41_pblendvb, + WriteVarBlend>; +} + +let Predicates = [HasAVX2] in { +defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, + loadv4i64, int_x86_avx2_pblendvb, + WriteVarBlend>, VEX_L; +} + +let Predicates = [HasAVX] in { + def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), + (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1), + (v4f32 VR128:$src2))), + (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), + (v8i32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), + (v8f32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), + (v4i64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), + (v4f64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), + (v32i8 VR256:$src2))), + (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; +} + +// Patterns +// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or +// on targets where they have equal performance. 
These were changed to use +// blends because blends have better throughput on SandyBridge and Haswell, but +// movs[s/d] are 1-2 byte shorter instructions. +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; + } + + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), + sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), + sub_xmm)>; + + // These will incur an FP/int domain crossing penalty, but it may be the only + // way without AVX2. Do not add any complexity because we may be able to match + // more optimal patterns defined earlier in this file. + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>; +} + +// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or +// on targets where they have equal performance. These were changed to use +// blends because blends have better throughput on SandyBridge and Haswell, but +// movs[s/d] are 1-2 byte shorter instructions. +let Predicates = [UseSSE41] in { + // With SSE41 we can use blends for these patterns. 
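Illustrative aside, not part of this patch: in C terms the X86vzmovl patterns in this block mean "keep lane 0, zero the rest", which on SSE4.1 can be expressed as a blend against a zero vector instead of a movss-style move, for the throughput reasons given in the FIXME above. A minimal sketch assuming the standard SSE4.1 blend intrinsic _mm_blend_ps (smmintrin.h); the helper name is made up for the example:

#include <smmintrin.h>   /* SSE4.1 */

/* Keep element 0 of v and zero elements 1..3.  With the immediate 0x1
 * the blend takes lane 0 from v and the remaining lanes from the zero
 * vector, which is what the (BLENDPSrri (V_SET0), $src, 1) selection
 * below produces. */
static __m128 keep_low_lane(__m128 v) {
    return _mm_blend_ps(_mm_setzero_ps(), v, 0x1);
}
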
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>; +} + + +/// SS41I_ternary_int - SSE 4.1 ternary operator +let Uses = [XMM0], Constraints = "$src1 = $dst" in { + multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + X86MemOperand x86memop, Intrinsic IntId, + OpndItins itins = DEFAULT_ITINS> { + def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], + itins.rr>, Sched<[itins.Sched]>; + + def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, x86memop:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, + (IntId VR128:$src1, + (bitconvert (mem_frag addr:$src2)), XMM0))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + +let ExeDomain = SSEPackedDouble in +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, + int_x86_sse41_blendvpd, + DEFAULT_ITINS_FBLENDSCHED>; +let ExeDomain = SSEPackedSingle in +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, + int_x86_sse41_blendvps, + DEFAULT_ITINS_FBLENDSCHED>; +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, + int_x86_sse41_pblendvb, + DEFAULT_ITINS_VARBLENDSCHED>; + +// Aliases with the implicit xmm0 argument +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (BLENDVPSrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; + +let Predicates = [UseSSE41] in { + def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), + (v4f32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; +} + +let SchedRW = [WriteLoad] in { +let Predicates = [HasAVX] in +def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, + VEX; +let Predicates = [HasAVX2] in +def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>, + VEX, VEX_L; +def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins 
i128mem:$src), + "movntdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// SSE4.2 - Compare Instructions +//===----------------------------------------------------------------------===// + +/// SS42I_binop_rm - Simple SSE 4.2 binary operator +multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1> { + def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>; + def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>; +} + +let Predicates = [HasAVX] in + defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, + loadv2i64, i128mem, 0>, VEX_4V; + +let Predicates = [HasAVX2] in + defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + +let Constraints = "$src1 = $dst" in + defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, + memopv2i64, i128mem>; + +//===----------------------------------------------------------------------===// +// SSE4.2 - String/text Processing Instructions +//===----------------------------------------------------------------------===// + +// Packed Compare Implicit Length Strings, Return Mask +multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> { + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, + imm:$src3))]>; + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, + (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; +} + +let Defs = [EFLAGS], usesCustomInserter = 1 in { + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>, + Requires<[UseSSE42]>; +} + +multiclass pcmpistrm_SS42AI<string asm> { + def rr : SS42AI<0x62, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrM]>; + let mayLoad = 1 in + def rm :SS42AI<0x62, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>; +} + +let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; + defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ; +} + +// Packed Compare Explicit Length Strings, Return Mask +multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> { + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 + VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, 
i128mem:$src3, u8imm:$src5), + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, + (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>; +} + +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>, + Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpestrm<string asm> { + def rr : SS42AI<0x60, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrM]>; + let mayLoad = 1 in + def rm : SS42AI<0x60, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>; +} + +let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; + defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">; +} + +// Packed Compare Implicit Length Strings, Return Index +multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> { + def REG : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + [(set GR32:$dst, EFLAGS, + (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>; + def MEM : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, + (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; +} + +let Defs = [EFLAGS], usesCustomInserter = 1 in { + defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>, + Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpistri<string asm> { + def rr : SS42AI<0x63, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrI]>; + let mayLoad = 1 in + def rm : SS42AI<0x63, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>; +} + +let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; + defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; +} + +// Packed Compare Explicit Length Strings, Return Index +multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> { + def REG : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + [(set GR32:$dst, EFLAGS, + (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; + def MEM : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), + [(set GR32:$dst, EFLAGS, + (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX, + imm:$src5))]>; +} + +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>, + Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpestri<string asm> { + def rr : SS42AI<0x61, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrI]>; + let mayLoad = 1 in + def rm : SS42AI<0x61, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, 
u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>; +} + +let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; + defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; +} + +//===----------------------------------------------------------------------===// +// SSE4.2 - CRC Instructions +//===----------------------------------------------------------------------===// + +// No CRC instructions have AVX equivalents + +// crc intrinsic instruction +// This set of instructions are only rm, the only difference is the size +// of r and m. +class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut, + RegisterClass RCIn, SDPatternOperator Int> : + SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2), + !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), + [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>, + Sched<[WriteFAdd]>; + +class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut, + X86MemOperand x86memop, SDPatternOperator Int> : + SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2), + !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), + [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))], + IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>; + +let Constraints = "$src1 = $dst" in { + def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem, + int_x86_sse42_crc32_32_8>; + def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8, + int_x86_sse42_crc32_32_8>; + def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem, + int_x86_sse42_crc32_32_16>, OpSize16; + def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16, + int_x86_sse42_crc32_32_16>, OpSize16; + def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem, + int_x86_sse42_crc32_32_32>, OpSize32; + def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32, + int_x86_sse42_crc32_32_32>, OpSize32; + def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem, + int_x86_sse42_crc32_64_64>, REX_W; + def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64, + int_x86_sse42_crc32_64_64>, REX_W; + let hasSideEffects = 0 in { + let mayLoad = 1 in + def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem, + null_frag>, REX_W; + def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8, + null_frag>, REX_W; + } +} + +//===----------------------------------------------------------------------===// +// SHA-NI Instructions +//===----------------------------------------------------------------------===// + +multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, + bit UsesXMM0 = 0> { + def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [!if(UsesXMM0, + (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), + (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8; + + def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [!if(UsesXMM0, + (set VR128:$dst, (IntId VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), + (set VR128:$dst, (IntId VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8; +} + +let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { + def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + "sha1rnds4\t{$src3, 
$src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, + (i8 imm:$src3)))]>, TA; + def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_sha1rnds4 VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), + (i8 imm:$src3)))]>, TA; + + defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>; + defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>; + defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>; + + let Uses=[XMM0] in + defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>; + + defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>; + defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>; +} + +// Aliases with explicit %xmm0 +def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (SHA256RNDS2rr VR128:$dst, VR128:$src2)>; +def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>; + +//===----------------------------------------------------------------------===// +// AES-NI Instructions +//===----------------------------------------------------------------------===// + +multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + Sched<[WriteAESDecEnc]>; + def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, + Sched<[WriteAESDecEncLd, ReadAfterLd]>; +} + +// Perform One Round of an AES Encryption/Decryption Flow +let Predicates = [HasAVX, HasAES] in { + defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", + int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V; + defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", + int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V; + defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", + int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V; + defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", + int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", + int_x86_aesni_aesenc, memopv2i64>; + defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", + int_x86_aesni_aesenclast, memopv2i64>; + defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", + int_x86_aesni_aesdec, memopv2i64>; + defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", + int_x86_aesni_aesdeclast, memopv2i64>; +} + +// Perform the AES InvMixColumn Transformation +let Predicates = [HasAVX, HasAES] in { + def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, + VEX; + def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, + 
Sched<[WriteAESIMCLd]>, VEX; +} +def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; +def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, + Sched<[WriteAESIMCLd]>; + +// AES Round Key Generation Assist +let Predicates = [HasAVX, HasAES] in { + def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + Sched<[WriteAESKeyGen]>, VEX; + def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, + Sched<[WriteAESKeyGenLd]>, VEX; +} +def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + Sched<[WriteAESKeyGen]>; +def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, + Sched<[WriteAESKeyGenLd]>; + +//===----------------------------------------------------------------------===// +// PCLMUL Instructions +//===----------------------------------------------------------------------===// + +// AVX carry-less Multiplication instructions +let isCommutable = 1 in +def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, + Sched<[WriteCLMul]>; + +def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (loadv2i64 addr:$src2), imm:$src3))]>, + Sched<[WriteCLMulLd, ReadAfterLd]>; + +// Carry-less Multiplication instructions +let Constraints = "$src1 = $dst" in { +let isCommutable = 1 in +def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))], + IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>; + +def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (memopv2i64 addr:$src2), imm:$src3))], + IIC_SSE_PCLMULQDQ_RM>, + Sched<[WriteCLMulLd, ReadAfterLd]>; +} // Constraints = "$src1 = $dst" + + +multiclass pclmul_alias<string asm, int immop> { + def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), + (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>; + + def : InstAlias<!strconcat("pclmul", asm, "dq 
{$src, $dst|$dst, $src}"), + (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>; + + def : InstAlias<!strconcat("vpclmul", asm, + "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), + (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop), + 0>; + + def : InstAlias<!strconcat("vpclmul", asm, + "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), + (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop), + 0>; +} +defm : pclmul_alias<"hqhq", 0x11>; +defm : pclmul_alias<"hqlq", 0x01>; +defm : pclmul_alias<"lqhq", 0x10>; +defm : pclmul_alias<"lqlq", 0x00>; + +//===----------------------------------------------------------------------===// +// SSE4A Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE4A] in { + +let Constraints = "$src = $dst" in { +def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), + (ins VR128:$src, u8imm:$len, u8imm:$idx), + "extrq\t{$idx, $len, $src|$src, $len, $idx}", + [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, + imm:$idx))]>, PD; +def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "extrq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, + VR128:$mask))]>, PD; + +def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), + "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", + [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, + imm:$len, imm:$idx))]>, XD; +def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "insertq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, + VR128:$mask))]>, XD; +} + +def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), + "movntss\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS; + +def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movntsd\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD; +} + +//===----------------------------------------------------------------------===// +// AVX Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + PatFrag ld_frag, SchedWrite Sched> : + AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[Sched]>, VEX { + let mayLoad = 1; +} + +// AVX2 adds register forms +class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType ResVT, ValueType OpVT, SchedWrite Sched> : + AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, + Sched<[Sched]>, VEX; + +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, + f32mem, v4f32, loadf32, WriteLoad>; + def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, + f32mem, v8f32, loadf32, + WriteFShuffleLd>, VEX_L; +} +let ExeDomain = SSEPackedDouble in +def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, + v4f64, 
loadf64, WriteFShuffleLd>, VEX_L; + +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, + v4f32, v4f32, WriteFShuffle>; + def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, + v8f32, v4f32, WriteFShuffle256>, VEX_L; +} +let ExeDomain = SSEPackedDouble in +def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, + v4f64, v2f64, WriteFShuffle256>, VEX_L; + +let mayLoad = 1, Predicates = [HasAVX2] in +def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), + (ins i128mem:$src), + "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteLoad]>, VEX, VEX_L; + +def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), + (ins f128mem:$src), + "vbroadcastf128\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>, + Sched<[WriteFShuffleLd]>, VEX, VEX_L; + +let Predicates = [HasAVX] in +def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), + (VBROADCASTF128 addr:$src)>; + + +//===----------------------------------------------------------------------===// +// VINSERTF128 - Insert packed floating-point values +// +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { +def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L; +let mayLoad = 1 in +def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f128mem:$src2, u8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX, NoVLX] in { +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; + +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; + +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm 
VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +} + +//===----------------------------------------------------------------------===// +// VEXTRACTF128 - Extract packed floating-point values +// +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { +def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, u8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, Sched<[WriteFShuffle]>, VEX, VEX_L; +let mayStore = 1 in +def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), + (ins f128mem:$dst, VR256:$src1, u8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, Sched<[WriteStore]>, VEX, VEX_L; +} + +// AVX1 patterns +let Predicates = [HasAVX] in { +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v4f32 (VEXTRACTF128rr + (v8f32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v2f64 (VEXTRACTF128rr + (v4f64 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; + +def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v2i64 (VEXTRACTF128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v4i32 (VEXTRACTF128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v8i16 (VEXTRACTF128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v16i8 (VEXTRACTF128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; + +def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +} + +//===----------------------------------------------------------------------===// +// VMASKMOV - Conditional SIMD Packed Loads and Stores +// +multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, + Intrinsic IntLd, Intrinsic 
IntLd256, + Intrinsic IntSt, Intrinsic IntSt256> { + def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, + VEX_4V; + def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V, VEX_L; + def mr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; +} + +let ExeDomain = SSEPackedSingle in +defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", + int_x86_avx_maskload_ps, + int_x86_avx_maskload_ps_256, + int_x86_avx_maskstore_ps, + int_x86_avx_maskstore_ps_256>; +let ExeDomain = SSEPackedDouble in +defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", + int_x86_avx_maskload_pd, + int_x86_avx_maskload_pd_256, + int_x86_avx_maskstore_pd, + int_x86_avx_maskstore_pd_256>; + +//===----------------------------------------------------------------------===// +// VPERMIL - Permute Single and Double Floating-Point Values +// +multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop_f, + X86MemOperand x86memop_i, PatFrag i_frag, + Intrinsic IntVar, ValueType vt> { + def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V, + Sched<[WriteFShuffle]>; + def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop_i:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, + (bitconvert (i_frag addr:$src2))))]>, VEX_4V, + Sched<[WriteFShuffleLd, ReadAfterLd]>; + + let Predicates = [HasAVX, NoVLX] in { + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, + Sched<[WriteFShuffle]>; + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + (ins x86memop_f:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, + Sched<[WriteFShuffleLd]>; + }// Predicates = [HasAVX, NoVLX] +} + +let ExeDomain = SSEPackedSingle in { + defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, + loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>; + defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, + loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L; +} +let ExeDomain = SSEPackedDouble in { + defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, + loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>; + defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, + loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; +} + +let Predicates = [HasAVX, NoVLX] in { +def : Pat<(v8f32 
(X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), + (VPERMILPSYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPERMILPSYrm VR256:$src1, addr:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), + (VPERMILPDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), + (VPERMILPDYrm VR256:$src1, addr:$src2)>; + +def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), + (VPERMILPSYri VR256:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), + (VPERMILPDYri VR256:$src1, imm:$imm)>; +def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), + (i8 imm:$imm))), + (VPERMILPSYmi addr:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDYmi addr:$src1, imm:$imm)>; + +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), + (VPERMILPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPERMILPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), + (VPERMILPDrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), + (VPERMILPDrm VR128:$src1, addr:$src2)>; + +def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))), + (VPERMILPDri VR128:$src1, imm:$imm)>; +def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDmi addr:$src1, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks +// +let ExeDomain = SSEPackedSingle in { +def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, VEX_4V, VEX_L, + Sched<[WriteFShuffle]>; +def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2), + (i8 imm:$src3)))]>, VEX_4V, VEX_L, + Sched<[WriteFShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, + (loadv4f64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, + (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, + (loadv4i64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm 
VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, + (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// VZERO - Zero YMM registers +// +let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { + // Zero All YMM registers + def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", + [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>; + + // Zero Upper bits of YMM registers + def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", + [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>; +} + +//===----------------------------------------------------------------------===// +// Half precision conversion instructions +//===----------------------------------------------------------------------===// +multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { + def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set RC:$dst, (Int VR128:$src))]>, + T8PD, VEX, Sched<[WriteCvtF2F]>; + let hasSideEffects = 0, mayLoad = 1 in + def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX, + Sched<[WriteCvtF2FLd]>; +} + +multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { + def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), + (ins RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, + TAPD, VEX, Sched<[WriteCvtF2F]>; + let hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteCvtF2FLd, WriteRMW] in + def mr : Ii8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + TAPD, VEX; +} + +let Predicates = [HasF16C] in { + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; + defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; + + // Pattern match vcvtph2ps of a scalar i64 load. + def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)), + (VCVTPH2PSrm addr:$src)>; + def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), + (VCVTPH2PSrm addr:$src)>; + + def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 + (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), + addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16 + (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), + addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)), + addr:$dst), + (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; +} + +// Patterns for matching conversions from float to half-float and vice versa. 
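(Editorial note, hedged, not part of the patch: the short C++ sketch below only illustrates, at the source level, the float-to-half and half-to-float conversions that the fp_to_f16/f16_to_fp patterns in the next block lower to via VCVTPS2PH/VCVTPH2PS. The F16C intrinsics used are the documented ones; the helper names and the choice of rounding constant are ours, and the example assumes a compiler invoked with F16C enabled, e.g. -mf16c.)

    #include <immintrin.h>
    #include <cstdint>

    // Scalar float -> IEEE half, round to nearest even (vcvtps2ph).
    static inline std::uint16_t float_to_half(float f) {
      __m128i h = _mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_TO_NEAREST_INT);
      return static_cast<std::uint16_t>(_mm_extract_epi16(h, 0));
    }

    // Scalar IEEE half -> float (vcvtph2ps reads the half from the low
    // 16 bits of an XMM register).
    static inline float half_to_float(std::uint16_t h) {
      return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));
    }

The scalar patterns that follow select the same VCVTPS2PH/VCVTPH2PS register forms for the fp_to_f16 and f16_to_fp nodes, moving the 16-bit result in and out of the low lane of an XMM register.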
+let Predicates = [HasF16C] in { + def : Pat<(fp_to_f16 FR32:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr + (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; + + def : Pat<(f16_to_fp GR16:$src), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; + + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; +} + +//===----------------------------------------------------------------------===// +// AVX2 Instructions +//===----------------------------------------------------------------------===// + +/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate +multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop> { + let isCommutable = 1 in + def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + Sched<[WriteBlend]>, VEX_4V; + def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, + Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; +} + +defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32, + VR128, loadv2i64, i128mem>; +defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, + VR256, loadv4i64, i256mem>, VEX_L; + +//===----------------------------------------------------------------------===// +// VPBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + ValueType OpVT128, ValueType OpVT256, Predicate prd> { + let Predicates = [HasAVX2, prd] in { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, + Sched<[WriteShuffle]>, VEX; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[WriteLoad]>, VEX; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, + Sched<[WriteShuffle256]>, VEX, VEX_L; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[WriteLoad]>, VEX, VEX_L; + + // Provide aliases for broadcast from the same register class that + // automatically does the extract. 
+ def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), + (!cast<Instruction>(NAME#"Yrr") + (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; + } +} + +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, + v16i8, v32i8, NoVLX_Or_NoBWI>; +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, + v8i16, v16i16, NoVLX_Or_NoBWI>; +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, + v4i32, v8i32, NoVLX>; +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, + v2i64, v4i64, NoVLX>; + +let Predicates = [HasAVX2] in { + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + + // Provide aliases for broadcast from the same register class that + // automatically does the extract. + def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), + (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), + (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), + sub_xmm)))>; + + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. + let AddedComplexity = 20 in { + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + + def : Pat<(v16i8 (X86VBroadcast GR8:$src)), + (VPBROADCASTBrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + VR128))>; + def : Pat<(v32i8 (X86VBroadcast GR8:$src)), + (VPBROADCASTBYrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + VR128))>; + + def : Pat<(v8i16 (X86VBroadcast GR16:$src)), + (VPBROADCASTWrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), + VR128))>; + def : Pat<(v16i16 (X86VBroadcast GR16:$src)), + (VPBROADCASTWYrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), + VR128))>; + + // The patterns for VPBROADCASTD are not needed because they would match + // the exact same thing as VBROADCASTSS patterns. + + def : Pat<(v2i64 (X86VBroadcast GR64:$src)), + (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + // The v4i64 pattern is not needed because VBROADCASTSDYrr already match. 
+ } +} + +// AVX1 broadcast patterns +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VBROADCASTSDYrm addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +} + +let Predicates = [HasAVX] in { + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. + let AddedComplexity = 20 in { + // 128bit broadcasts: + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; + } + + def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2i64 (X86VBroadcast i64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>; +} + +//===----------------------------------------------------------------------===// +// VPERM - Permute instructions +// + +multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + ValueType OpVT, X86FoldableSchedWrite Sched> { + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, + Sched<[Sched]>, VEX_4V, VEX_L; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermv VR256:$src1, + (bitconvert (mem_frag addr:$src2)))))]>, + Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; +} + +defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>; +let ExeDomain = SSEPackedSingle in +defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>; + +multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + ValueType OpVT, X86FoldableSchedWrite Sched> { + def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, + Sched<[Sched]>, VEX, VEX_L; + def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermi (mem_frag addr:$src1), + (i8 
imm:$src2))))]>, + Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L; +} + +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, + WriteShuffle256>, VEX_W; +let ExeDomain = SSEPackedDouble in +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, + WriteFShuffle256>, VEX_W; + +//===----------------------------------------------------------------------===// +// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks +// +def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, + VEX_4V, VEX_L; +def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), + (i8 imm:$src3)))]>, + Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + +let Predicates = [HasAVX2] in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + + +//===----------------------------------------------------------------------===// +// VINSERTI128 - Insert packed integer values +// +let hasSideEffects = 0 in { +def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; +let mayLoad = 1 in +def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i128mem:$src2, u8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX2, NoVLX] in { +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; + +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, 
addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +} + +//===----------------------------------------------------------------------===// +// VEXTRACTI128 - Extract packed integer values +// +def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, u8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + Sched<[WriteShuffle256]>, VEX, VEX_L; +let hasSideEffects = 0, mayStore = 1 in +def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), + (ins i128mem:$dst, VR256:$src1, u8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + Sched<[WriteStore]>, VEX, VEX_L; + +let Predicates = [HasAVX2] in { +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v2i64 (VEXTRACTI128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v4i32 (VEXTRACTI128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v8i16 (VEXTRACTI128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v16i8 (VEXTRACTI128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; + +def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +} + +//===----------------------------------------------------------------------===// +// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores +// +multiclass avx2_pmovmask<string OpcodeStr, + Intrinsic IntLd128, Intrinsic IntLd256, + Intrinsic IntSt128, Intrinsic IntSt256> { + def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V; + def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V, VEX_L; + def mr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, 
$src2}"), + [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + def Ymr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; +} + +defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", + int_x86_avx2_maskload_d, + int_x86_avx2_maskload_d_256, + int_x86_avx2_maskstore_d, + int_x86_avx2_maskstore_d_256>; +defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", + int_x86_avx2_maskload_q, + int_x86_avx2_maskload_q_256, + int_x86_avx2_maskstore_q, + int_x86_avx2_maskstore_q_256>, VEX_W; + +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), + (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), + (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), + (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), + (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), + (bc_v8f32 (v8i32 immAllZerosV)))), + (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), + (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), + (bc_v4f32 (v4i32 immAllZerosV)))), + (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))), + (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), + (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))), + (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))), + (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), + (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), + (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (v4f64 immAllZerosV))), + (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + 
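(Editorial note, hedged, not part of the patch: before the remaining 64-bit-element and 128-bit masked_load/masked_store patterns below, here is a source-level sketch of what these VMASKMOV/VPMASKMOV selections correspond to. The intrinsics are the documented AVX/AVX2 ones; the helper names and the prefix-masking scheme are ours.)

    #include <immintrin.h>

    // Load/store only the first n of 8 floats (0 <= n <= 8) with vmaskmovps.
    // Masked-out lanes of the load read as zero, which is why the
    // zero-passthru masked_load patterns above map straight to VMASKMOVPS,
    // while a non-zero passthru needs the extra VBLENDVPS blend.
    static inline __m256 load_prefix_ps(const float *p, int n) {
      const __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      const __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(n), lane); // AVX2
      return _mm256_maskload_ps(p, mask);
    }

    static inline void store_prefix_ps(float *p, __m256 v, int n) {
      const __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      const __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(n), lane);
      _mm256_maskstore_ps(p, mask, v); // vmaskmovps, store form
    }

The integer variants (_mm256_maskload_epi32 and friends) follow the same shape and are covered by the VPMASKMOVD/VPMASKMOVQ patterns in this block.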
+def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (bc_v4i64 (v8i32 immAllZerosV)))), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), + (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), + (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), + (VMASKMOVPDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), + (v2f64 immAllZerosV))), + (VMASKMOVPDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))), + (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), + (VPMASKMOVQrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), + (bc_v2i64 (v4i32 immAllZerosV)))), + (VPMASKMOVQrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))), + (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +//===----------------------------------------------------------------------===// +// Variable Bit Shifts +// +multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, ValueType vt256> { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, + VEX_4V, Sched<[WriteVarVecShift]>; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, + VEX_4V, VEX_L, Sched<[WriteVarVecShift]>; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, + (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>, + VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; + defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; + defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; + defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; + defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; +} +//===----------------------------------------------------------------------===// +// VGATHER - GATHER Operations +multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, + 
X86MemOperand memop128, X86MemOperand memop256> { + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb), + (ins VR128:$src1, memop128:$src2, VR128:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3; + def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb), + (ins RC256:$src1, memop256:$src2, RC256:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3, VEX_L; +} + +let mayLoad = 1, Constraints + = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" + in { + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>; + + let ExeDomain = SSEPackedDouble in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W; + } + + let ExeDomain = SSEPackedSingle in { + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; + } +} + +//===----------------------------------------------------------------------===// +// Extra selection patterns for FR128, f128, f128mem + +// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. +def : Pat<(store (f128 FR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>; + +def : Pat<(loadf128 addr:$src), + (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>; + +// andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 +def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fand FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(and FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86for FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(or FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fxor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(xor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td new file mode 100644 index 0000000..c847be7e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td @@ -0,0 +1,62 @@ +//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the AMD SVM instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SVM instructions + +// 0F 01 D9 +def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB; + +// 0F 01 DC +def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB; + +// 0F 01 DD +def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; + +// 0F 01 DE +let Uses = [EAX] in +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB; + +// 0F 01 D8 +let Uses = [EAX] in +def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), + "vmrun\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>; +let Uses = [RAX] in +def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), + "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DA +let Uses = [EAX] in +def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), + "vmload\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>; +let Uses = [RAX] in +def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), + "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DB +let Uses = [EAX] in +def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), + "vmsave\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>; +let Uses = [RAX] in +def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), + "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DF +let Uses = [EAX, ECX] in +def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[Not64BitMode]>; +let Uses = [RAX, ECX] in +def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td new file mode 100644 index 0000000..c1df978 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -0,0 +1,969 @@ +//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the shift and rotate instructions. +// +//===----------------------------------------------------------------------===// + +// FIXME: Someone needs to smear multipattern goodness all over this file. + +let Defs = [EFLAGS] in { + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1), + "shl{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>; +def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize16; +def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>, OpSize32; +def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; +} // Uses = [CL] + +def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "shl{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; + +let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "shl{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, + OpSize16; +def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "shl{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, + OpSize32; +def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "shl{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; +} // isConvertibleToThreeAddress = 1 + +// NOTE: We don't include patterns for shifts of a register by one, because +// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one). +let hasSideEffects = 0 in { +def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1), + "shl{b}\t$dst", [], IIC_SR>; +def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t$dst", [], IIC_SR>, OpSize16; +def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t$dst", [], IIC_SR>, OpSize32; +def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t$dst", [], IIC_SR>; +} // hasSideEffects = 0 +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern +// using CL? +let Uses = [CL] in { +def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; +def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize16; +def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize32; +def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; +} +def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src), + "shl{b}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src), + "shl{w}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize16; +def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src), + "shl{l}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize32; +def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src), + "shl{q}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Shift by 1 +def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t$dst", + [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t$dst", + [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t$dst", + [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t$dst", + [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in 
{ +def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1), + "shr{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>; +def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize16; +def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>, OpSize32; +def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; +} + +def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2), + "shr{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; +def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "shr{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize16; +def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "shr{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize32; +def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), + "shr{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; + +// Shift right by 1 +def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1), + "shr{b}\t$dst", + [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>; +def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t$dst", + [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize16; +def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t$dst", + [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>, OpSize32; +def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t$dst", + [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst), + "shr{b}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; +def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize16; +def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize32; +def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; +} +def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src), + "shr{b}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src), + "shr{w}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize16; +def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src), + "shr{l}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize32; +def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src), + "shr{q}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Shift by 1 +def SHR8m1 : I<0xD0, MRM5m, 
(outs), (ins i8mem :$dst), + "shr{b}\t$dst", + [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t$dst", + [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t$dst", + [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t$dst", + [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (sra GR8:$src1, CL))], + IIC_SR>; +def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (sra GR16:$src1, CL))], + IIC_SR>, OpSize16; +def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (sra GR32:$src1, CL))], + IIC_SR>, OpSize32; +def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (sra GR64:$src1, CL))], + IIC_SR>; +} + +def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "sar{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))], + IIC_SR>; +def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "sar{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize16; +def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "sar{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize32; +def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "sar{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// Shift by 1 +def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t$dst", + [(set GR8:$dst, (sra GR8:$src1, (i8 1)))], + IIC_SR>; +def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t$dst", + [(set GR16:$dst, (sra GR16:$src1, (i8 1)))], + IIC_SR>, OpSize16; +def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t$dst", + [(set GR32:$dst, (sra GR32:$src1, (i8 1)))], + IIC_SR>, OpSize32; +def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t$dst", + [(set GR64:$dst, (sra GR64:$src1, (i8 1)))], + IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize16; +def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize32; +def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; +} +def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src), + "sar{b}\t{$src, $dst|$dst, $src}", + 
[(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src), + "sar{w}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize16; +def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src), + "sar{l}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize32; +def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src), + "sar{q}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Shift by 1 +def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t$dst", + [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t$dst", + [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t$dst", + [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t$dst", + [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Rotate instructions +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0 in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t$dst", [], IIC_SR>; +def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + +def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t$dst", [], IIC_SR>, OpSize16; +def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; +let Uses = [CL] in +def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; + +def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t$dst", [], IIC_SR>, OpSize32; +def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; +let Uses = [CL] in +def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; + + +def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t$dst", [], IIC_SR>; +def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + + +def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t$dst", [], IIC_SR>; +def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + +def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t$dst", [], IIC_SR>, OpSize16; +def RCR16ri : Ii8<0xC1, 
MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; +let Uses = [CL] in +def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; + +def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t$dst", [], IIC_SR>, OpSize32; +def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; +let Uses = [CL] in +def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; + +def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t$dst", [], IIC_SR>; +def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + +} // Constraints = "$src = $dst" + +let SchedRW = [WriteShiftLd, WriteRMW] in { +def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t$dst", [], IIC_SR>; +def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t$dst", [], IIC_SR>, OpSize16; +def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; +def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t$dst", [], IIC_SR>, OpSize32; +def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; +def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t$dst", [], IIC_SR>; +def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; + +def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t$dst", [], IIC_SR>; +def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t$dst", [], IIC_SR>, OpSize16; +def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; +def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t$dst", [], IIC_SR>, OpSize32; +def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; +def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), + "rcr{q}\t$dst", [], IIC_SR>; +def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; + +let Uses = [CL] in { +def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; +def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; +def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + +def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; +def RCR32mCL : 
I<0xD3, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; +def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +} +} // SchedRW +} // hasSideEffects = 0 + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +// FIXME: provide shorter instructions when imm8 == 1 +let Uses = [CL] in { +def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>; +def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize16; +def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>, OpSize32; +def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; +} + +def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "rol{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; +def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "rol{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize16; +def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "rol{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize32; +def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "rol{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// Rotate by 1 +def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t$dst", + [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))], + IIC_SR>; +def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t$dst", + [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))], + IIC_SR>, OpSize16; +def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t$dst", + [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))], + IIC_SR>, OpSize32; +def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t$dst", + [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))], + IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize16; +def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize32; +def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; +} +def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1), + "rol{b}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; +def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1), + "rol{w}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>, OpSize16; 
+def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1), + "rol{l}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>, OpSize32; +def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1), + "rol{q}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; + +// Rotate by 1 +def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t$dst", + [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t$dst", + [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t$dst", + [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t$dst", + [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>; +def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize16; +def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>, OpSize32; +def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; +} + +def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "ror{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>; +def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "ror{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize16; +def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "ror{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize32; +def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "ror{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// Rotate by 1 +def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t$dst", + [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))], + IIC_SR>; +def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t$dst", + [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))], + IIC_SR>, OpSize16; +def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t$dst", + [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))], + IIC_SR>, OpSize32; +def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t$dst", + [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))], + IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, 
OpSize16; +def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize32; +def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; +} +def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src), + "ror{b}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src), + "ror{w}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize16; +def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src), + "ror{l}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize32; +def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), + "ror{q}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Rotate by 1 +def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t$dst", + [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t$dst", + [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t$dst", + [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t$dst", + [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + + +//===----------------------------------------------------------------------===// +// Double shift instructions (generalizations of rotate) +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { + +let Uses = [CL] in { +def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))], + IIC_SHD16_REG_CL>, + TB, OpSize16; +def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))], + IIC_SHD16_REG_CL>, + TB, OpSize16; +def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))], + IIC_SHD32_REG_CL>, TB, OpSize32; +def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))], + IIC_SHD32_REG_CL>, TB, OpSize32; +def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))], + IIC_SHD64_REG_CL>, + TB; +def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))], + IIC_SHD64_REG_CL>, + TB; +} + +let isCommutable = 1 in { // These instructions commute to each other. 
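// A rough illustration of why these commute (an ISA identity, AT&T syntax): a
// double shift left by k yields the same bit pattern as a double shift right
// by (size - k) with the two register operands swapped, e.g. for 16-bit
// operands
//
//   shld $5,  %bx, %ax        # %ax = (%ax << 5)  | (%bx >> 11)
//   shrd $11, %ax, %bx        # %bx = (%bx >> 11) | (%ax << 5)
//
// Both forms compute the same value, just into different destination registers.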
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, + (i8 imm:$src3)))], IIC_SHD16_REG_IM>, + TB, OpSize16; +def SHRD16rri8 : Ii8<0xAC, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, + (i8 imm:$src3)))], IIC_SHD16_REG_IM>, + TB, OpSize16; +def SHLD32rri8 : Ii8<0xA4, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, + (i8 imm:$src3)))], IIC_SHD32_REG_IM>, + TB, OpSize32; +def SHRD32rri8 : Ii8<0xAC, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, + (i8 imm:$src3)))], IIC_SHD32_REG_IM>, + TB, OpSize32; +def SHLD64rri8 : RIi8<0xA4, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, + (i8 imm:$src3)))], IIC_SHD64_REG_IM>, + TB; +def SHRD64rri8 : RIi8<0xAC, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, + (i8 imm:$src3)))], IIC_SHD64_REG_IM>, + TB; +} +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16; +def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16; + +def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32; +def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32; + +def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)], IIC_SHD64_MEM_CL>, TB; +def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)], IIC_SHD64_MEM_CL>, TB; +} + +def SHLD16mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD16_MEM_IM>, + TB, OpSize16; +def SHRD16mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, + (i8 
imm:$src3)), addr:$dst)], + IIC_SHD16_MEM_IM>, + TB, OpSize16; + +def SHLD32mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD32_MEM_IM>, + TB, OpSize32; +def SHRD32mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD32_MEM_IM>, + TB, OpSize32; + +def SHLD64mri8 : RIi8<0xA4, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD64_MEM_IM>, + TB; +def SHRD64mri8 : RIi8<0xAC, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD64_MEM_IM>, + TB; +} // SchedRW + +} // Defs = [EFLAGS] + +def ROT32L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 32-bit integer. + return getI8Imm(32 - N->getZExtValue(), SDLoc(N)); +}]>; + +def ROT64L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 64-bit integer. + return getI8Imm(64 - N->getZExtValue(), SDLoc(N)); +}]>; + +multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX, Sched<[WriteShift]>; + let mayLoad = 1 in + def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, u8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX, Sched<[WriteShiftLd]>; +} +} + +multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX_4VOp3, Sched<[WriteShift]>; + let mayLoad = 1 in + def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX_4VOp3, + Sched<[WriteShiftLd, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src1 + ReadAfterLd]>; +} +} + +let Predicates = [HasBMI2] in { + defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>; + defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W; + defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS; + defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W; + defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD; + defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W; + defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD; + defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, VEX_W; + + // Prefer RORX which is non-destructive and doesn't update EFLAGS. 
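// A rough illustration (AT&T syntax): RORX only rotates right, which is why the
// ROT32L2R_imm8/ROT64L2R_imm8 transforms above rewrite a left-rotate amount k
// as (32 - k) or (64 - k), e.g.
//
//   rol  $5, %eax             # read-modify-write on %eax, updates EFLAGS
//   rorx $27, %eax, %ecx      # same 32-bit rotation (32 - 5 = 27), separate
//                             # destination register, EFLAGS untouched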
+  let AddedComplexity = 10 in {
+    def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+              (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+    def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+              (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+  }
+
+  def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+            (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+  def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+            (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+  // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
+  // immediate shift, i.e. the following code is considered better
+  //
+  //  mov %edi, %esi
+  //  shl $imm, %esi
+  //  ... %edi, ...
+  //
+  // than
+  //
+  //  movb $imm, %sil
+  //  shlx %sil, %edi, %esi
+  //  ... %edi, ...
+  //
+  let AddedComplexity = 1 in {
+    def : Pat<(sra GR32:$src1, GR8:$src2),
+              (SARX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(sra GR64:$src1, GR8:$src2),
+              (SARX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(srl GR32:$src1, GR8:$src2),
+              (SHRX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(srl GR64:$src1, GR8:$src2),
+              (SHRX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(shl GR32:$src1, GR8:$src2),
+              (SHLX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(shl GR64:$src1, GR8:$src2),
+              (SHLX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+  }
+
+  // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+  //
+  //  mov (%ecx), %esi
+  //  shl $imm, %esi
+  //
+  // over
+  //
+  //  movb $imm, %al
+  //  shlx %al, (%ecx), %esi
+  //
+  // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole
+  // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
new file mode 100644
index 0000000..a97d1e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -0,0 +1,615 @@
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instructions that are generally used in
+// privileged modes. These are not typically used by the compiler, but are
+// supported for the assembler and disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RAX, RDX] in
+  def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+              TB;
+
+let Defs = [RAX, RCX, RDX] in
+  def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
+
+// CPU flow control instructions
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+  def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+  def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+}
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+
+// Interrupt and SysCall Instructions.
+let Uses = [EFLAGS] in + def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; +def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", + [(int_x86_int (i8 3))], IIC_INT3>; +} // SchedRW + +// The long form of "int $3" turns into int3 as a size optimization. +// FIXME: This doesn't work because InstAlias can't match immediate constants. +//def : InstAlias<"int\t$3", (INT3)>; + +let SchedRW = [WriteSystem] in { + +def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", + [(int_x86_int imm:$trap)], IIC_INT>; + + +def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB; +def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB; +def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB, + Requires<[In64BitMode]>; + +def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [], + IIC_SYS_ENTER_EXIT>, TB; + +def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [], + IIC_SYS_ENTER_EXIT>, TB; +def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [], + IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>; +} // SchedRW + +def : Pat<(debugtrap), + (INT3)>, Requires<[NotPS4]>; +def : Pat<(debugtrap), + (INT (i8 0x41))>, Requires<[IsPS4]>; + +//===----------------------------------------------------------------------===// +// Input/Output Instructions. +// +let SchedRW = [WriteSystem] in { +let Defs = [AL], Uses = [DX] in +def IN8rr : I<0xEC, RawFrm, (outs), (ins), + "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>; +let Defs = [AX], Uses = [DX] in +def IN16rr : I<0xED, RawFrm, (outs), (ins), + "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize16; +let Defs = [EAX], Uses = [DX] in +def IN32rr : I<0xED, RawFrm, (outs), (ins), + "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32; + +let Defs = [AL] in +def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port), + "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>; +let Defs = [AX] in +def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), + "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16; +let Defs = [EAX] in +def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), + "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32; + +let Uses = [DX, AL] in +def OUT8rr : I<0xEE, RawFrm, (outs), (ins), + "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>; +let Uses = [DX, AX] in +def OUT16rr : I<0xEF, RawFrm, (outs), (ins), + "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize16; +let Uses = [DX, EAX] in +def OUT32rr : I<0xEF, RawFrm, (outs), (ins), + "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32; + +let Uses = [AL] in +def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port), + "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>; +let Uses = [AX] in +def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), + "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16; +let Uses = [EAX] in +def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), + "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32; + +} // SchedRW + +//===----------------------------------------------------------------------===// +// Moves to and from debug registers + +let SchedRW = [WriteSystem] in { +def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB, + Requires<[Not64BitMode]>; +def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB, + Requires<[In64BitMode]>; + +def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), 
(ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB, + Requires<[Not64BitMode]>; +def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Moves to and from control registers + +let SchedRW = [WriteSystem] in { +def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB, + Requires<[Not64BitMode]>; +def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB, + Requires<[In64BitMode]>; + +def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB, + Requires<[Not64BitMode]>; +def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Segment override instruction prefixes + +def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>; +def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>; +def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>; +def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>; +def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>; +def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; + + +//===----------------------------------------------------------------------===// +// Moves to and from segment registers. +// + +let SchedRW = [WriteMove] in { +def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize16; +def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32; +def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; + +def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16; +def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32; +def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; + +def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16; +def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32; +def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; + +def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16; +def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32; +def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; +} // SchedRW + 
+//===----------------------------------------------------------------------===// +// Segmentation support instructions. + +let SchedRW = [WriteSystem] in { +def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB; + +def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, + OpSize16; +def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, + OpSize16; + +// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. +def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, + OpSize32; +def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, + OpSize32; +// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo. +def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; +def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; + +def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, + OpSize16; +def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, + OpSize16; +def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, + OpSize32; +def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, + OpSize32; +def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; +def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; + +def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", + [], IIC_INVLPG>, TB; + +def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), + "str{w}\t$dst", [], IIC_STR>, TB, OpSize16; +def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), + "str{l}\t$dst", [], IIC_STR>, TB, OpSize32; +def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), + "str{q}\t$dst", [], IIC_STR>, TB; +def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), + "str{w}\t$dst", [], IIC_STR>, TB; + +def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), + "ltr{w}\t$src", [], IIC_LTR>, TB; +def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), + "ltr{w}\t$src", [], IIC_LTR>, TB; + +def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), + "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), + "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), + "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), + "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), + "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), + "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHES16 : I<0x06, RawFrm, (outs), (ins), + "push{w}\t{%es|es}", [], IIC_PUSH_SR>, + OpSize16, 
Requires<[Not64BitMode]>; +def PUSHES32 : I<0x06, RawFrm, (outs), (ins), + "push{l}\t{%es|es}", [], IIC_PUSH_SR>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), + "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize16, TB; +def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), + "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, + OpSize32, Requires<[Not64BitMode]>; +def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), + "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize16, TB; +def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), + "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, + OpSize32, Requires<[Not64BitMode]>; +def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), + "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, + OpSize32, Requires<[In64BitMode]>; +def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), + "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, + OpSize32, Requires<[In64BitMode]>; + +// No "pop cs" instruction. +def POPSS16 : I<0x17, RawFrm, (outs), (ins), + "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>, + OpSize16, Requires<[Not64BitMode]>; +def POPSS32 : I<0x17, RawFrm, (outs), (ins), + "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>, + OpSize32, Requires<[Not64BitMode]>; + +def POPDS16 : I<0x1F, RawFrm, (outs), (ins), + "pop{w}\t{%ds|ds}", [], IIC_POP_SR>, + OpSize16, Requires<[Not64BitMode]>; +def POPDS32 : I<0x1F, RawFrm, (outs), (ins), + "pop{l}\t{%ds|ds}", [], IIC_POP_SR>, + OpSize32, Requires<[Not64BitMode]>; + +def POPES16 : I<0x07, RawFrm, (outs), (ins), + "pop{w}\t{%es|es}", [], IIC_POP_SR>, + OpSize16, Requires<[Not64BitMode]>; +def POPES32 : I<0x07, RawFrm, (outs), (ins), + "pop{l}\t{%es|es}", [], IIC_POP_SR>, + OpSize32, Requires<[Not64BitMode]>; + +def POPFS16 : I<0xa1, RawFrm, (outs), (ins), + "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize16, TB; +def POPFS32 : I<0xa1, RawFrm, (outs), (ins), + "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB, + OpSize32, Requires<[Not64BitMode]>; +def POPFS64 : I<0xa1, RawFrm, (outs), (ins), + "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB, + OpSize32, Requires<[In64BitMode]>; + +def POPGS16 : I<0xa9, RawFrm, (outs), (ins), + "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize16, TB; +def POPGS32 : I<0xa9, RawFrm, (outs), (ins), + "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB, + OpSize32, Requires<[Not64BitMode]>; +def POPGS64 : I<0xa9, RawFrm, (outs), (ins), + "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB, + OpSize32, Requires<[In64BitMode]>; + + +def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16; +def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32; + +def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; +def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; +def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; + +def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16; +def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32; + +def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; +def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + 
"lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; +def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; + +def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; +def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; + +def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; + + +def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), + "verr\t$seg", [], IIC_VERR>, TB; +def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), + "verr\t$seg", [], IIC_VERR>, TB; +def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), + "verw\t$seg", [], IIC_VERW_MEM>, TB; +def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), + "verw\t$seg", [], IIC_VERW_REG>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Descriptor-table support instructions + +let SchedRW = [WriteSystem] in { +def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), + "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>; +def SGDT32m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), + "sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires <[Not64BitMode]>; +def SGDT64m : I<0x01, MRM0m, (outs opaque80mem:$dst), (ins), + "sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires <[In64BitMode]>; +def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), + "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>; +def SIDT32m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), + "sidt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>; +def SIDT64m : I<0x01, MRM1m, (outs opaque80mem:$dst), (ins), + "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>; +def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins), + "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16; +def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins), + "sldt{w}\t$dst", [], IIC_SLDT>, TB; +def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), + "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB; + +// LLDT is not interpreted specially in 64-bit mode because there is no sign +// extension. 
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), + "sldt{q}\t$dst", [], IIC_SLDT>, TB; +def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins), + "sldt{q}\t$dst", [], IIC_SLDT>, TB; + +def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), + "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize16, Requires<[Not64BitMode]>; +def LGDT32m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), + "lgdt{l}\t$src", [], IIC_LGDT>, OpSize32, TB, Requires<[Not64BitMode]>; +def LGDT64m : I<0x01, MRM2m, (outs), (ins opaque80mem:$src), + "lgdt{q}\t$src", [], IIC_LGDT>, TB, Requires<[In64BitMode]>; +def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), + "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize16, Requires<[Not64BitMode]>; +def LIDT32m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), + "lidt{l}\t$src", [], IIC_LIDT>, OpSize32, TB, Requires<[Not64BitMode]>; +def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src), + "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>; +def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), + "lldt{w}\t$src", [], IIC_LLDT_REG>, TB; +def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), + "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Specialized register support +let SchedRW = [WriteSystem] in { +let Uses = [EAX, ECX, EDX] in +def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; +let Defs = [EAX, EDX], Uses = [ECX] in +def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; + +let Defs = [RAX, RDX], Uses = [ECX] in + def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>, + TB; + +def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), + "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB; +def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), + "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB; +// no m form encodable; use SMSW16m +def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), + "smsw{q}\t$dst", [], IIC_SMSW>, TB; + +// For memory operands, there is only a 16-bit form +def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins), + "smsw{w}\t$dst", [], IIC_SMSW>, TB; + +def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src), + "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB; +def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), + "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; + +let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in + def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Cache instructions +let SchedRW = [WriteSystem] in { +def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB; +def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// XSAVE instructions +let SchedRW = [WriteSystem] in { +let Predicates = [HasXSAVE] in { +let Defs = [EDX, EAX], Uses = [ECX] in + def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; + +let Uses = [EDX, EAX, ECX] in + def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; +} + +let Uses = [EDX, EAX] in { +let Predicates = [HasXSAVE] in { + def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave\t$dst", + [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB; + def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave64\t$dst", + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; + 
def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstor\t$dst", + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB; + def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstor64\t$dst", + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEOPT] in { + def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt\t$dst", + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS; + def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt64\t$dst", + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEC] in { + def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec\t$dst", + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB; + def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec64\t$dst", + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVES] in { + def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves\t$dst", + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB; + def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves64\t$dst", + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; + def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors\t$dst", + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB; + def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors64\t$dst", + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +} // Uses +} // SchedRW + +//===----------------------------------------------------------------------===// +// VIA PadLock crypto instructions +let Defs = [RAX, RDI], Uses = [RDX, RDI] in + def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB; + +def : InstAlias<"xstorerng", (XSTORE)>; + +let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in { + def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB; + def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB; + def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB; + def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB; + def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB; +} + +let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { + def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB; + def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB; +} +let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in + def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; +//==-----------------------------------------------------------------------===// +// PKU - enable protection key +let usesCustomInserter = 1 in { + def WRPKRU : PseudoI<(outs), (ins GR32:$src), + [(int_x86_wrpkru GR32:$src)]>; + def RDPKRU : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_rdpkru))]>; +} + +let Defs = [EAX, EDX], Uses = [ECX] in + def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; +let Uses = [EAX, ECX, EDX] in + def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; + +//===----------------------------------------------------------------------===// +// FS/GS Base Instructions +let Predicates = [HasFSGSBase, In64BitMode] in { + def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins), + "rdfsbase{l}\t$dst", + [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS; + def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins), + "rdfsbase{q}\t$dst", + [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS; + def 
RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+                   "rdgsbase{l}\t$dst",
+                   [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
+  def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+                      "rdgsbase{q}\t$dst",
+                      [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
+  def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
+                   "wrfsbase{l}\t$src",
+                   [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
+  def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
+                      "wrfsbase{q}\t$src",
+                      [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
+  def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
+                   "wrgsbase{l}\t$src",
+                   [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
+  def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
+                      "wrgsbase{q}\t$src",
+                      [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// INVPCID Instruction
+def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+                  "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                  Requires<[Not64BitMode]>;
+def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+                  "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                  Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SMAP Instruction
+let Defs = [EFLAGS] in {
+  def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+  def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// SMX Instruction
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
+  def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 0000000..7267d75
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,50 @@
+//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TSX instructions + +def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + +let usesCustomInserter = 1 in +def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins), + "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>, + Requires<[HasRTM]>; + +let isBranch = 1, isTerminator = 1, Defs = [EAX] in { +def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst), + "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>; +def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst), + "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>; +} + +def XEND : I<0x01, MRM_D5, (outs), (ins), + "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; + +let Defs = [EFLAGS] in +def XTEST : I<0x01, MRM_D6, (outs), (ins), + "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>; + +def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), + "xabort\t$imm", + [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; + +// HLE prefixes + +let isAsmParserOnly = 1 in { +def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>; +def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>; +} + diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td new file mode 100644 index 0000000..79afe9a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td @@ -0,0 +1,66 @@ +//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel VMX instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VMX instructions + +// 66 0F 38 80 +def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[Not64BitMode]>; +def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[In64BitMode]>; +// 66 0F 38 81 +def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[Not64BitMode]>; +def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[In64BitMode]>; +// 0F 01 C1 +def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; +def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmclear\t$vmcs", []>, PD; +// OF 01 D4 +def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB; +// 0F 01 C2 +def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; +// 0F 01 C3 +def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB; +def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmptrld\t$vmcs", []>, PS; +def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins), + "vmptrst\t$vmcs", []>, TB; +def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; +def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; +def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; +def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; +def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; +def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; +def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; +def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; +// 0F 01 C4 +def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB; +def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon), + "vmxon\t$vmxon", []>, XS; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td new file mode 100644 index 0000000..4cb2304 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -0,0 +1,344 @@ +//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes XOP (eXtended OPerations) +// +//===----------------------------------------------------------------------===// + +multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, XOP; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; +} + +let ExeDomain = SSEPackedInt in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>; +} + +// Scalar load 2 addr operand instructions +multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, XOP; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP; +} + +multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, XOP; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; +} + +multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop> { + def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L; + def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L; +} + +let ExeDomain = SSEPackedSingle in { + defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32>; + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>; + defm VFRCZPS : 
xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; +} + +multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift]>; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>; + def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), + (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>; +} + +let ExeDomain = SSEPackedInt in { + defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>; + defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>; + defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>; + defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>; + defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>; + defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>; + defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>; + defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>; + defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>; + defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>; + defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>; +} + +multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP; +} + +let ExeDomain = SSEPackedInt in { + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>; +} + +// Instruction where second source can be memory, but third must be register +multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { + let isCommutable = 1 in + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + 
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), + VR128:$src3))]>, XOP_4V, VEX_I8IMM; +} + +let ExeDomain = SSEPackedInt in { + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; + defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +} + +// Instruction where second source can be memory, third must be imm8 +multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> { + let isCommutable = 1 in + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + i8immZExt3:$cc)))]>, + XOP_4V; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))), + i8immZExt3:$cc)))]>, + XOP_4V; + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V; + let mayLoad = 1 in + def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V; + } +} + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>; + defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>; + defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>; + defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>; + defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>; + defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>; + defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>; +} + +// Instruction where either second or third source can be memory +multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, + XOP_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, + (bitconvert 
(loadv2i64 addr:$src3))))]>, + XOP_4V, VEX_I8IMM, VEX_W, MemOp4; + def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), + VR128:$src3))]>, + XOP_4V, VEX_I8IMM; +} + +let ExeDomain = SSEPackedInt in { + defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; + defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +} + +multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>, + XOP_4V, VEX_I8IMM, VEX_L; + def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int VR256:$src1, VR256:$src2, + (bitconvert (loadv4i64 addr:$src3))))]>, + XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L; + def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int VR256:$src1, (bitconvert (loadv4i64 addr:$src2)), + VR256:$src3))]>, + XOP_4V, VEX_I8IMM, VEX_L; +} + +let ExeDomain = SSEPackedInt in + defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; + +let Predicates = [HasXOP] in { + def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>; +} + +multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, + Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { + def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>; + def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>, + VEX_W, MemOp4; + def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>; + def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L; + def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set 
VR256:$dst, + (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, + VEX_W, MemOp4, VEX_L; + def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>, + VEX_L; +} + +let ExeDomain = SSEPackedDouble in + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, + int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>; + +let ExeDomain = SSEPackedSingle in + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, + int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; + diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h new file mode 100644 index 0000000..646b556 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -0,0 +1,2042 @@ +//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the details for lowering X86 intrinsics +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H +#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H + +namespace llvm { + +enum IntrinsicType { + INTR_NO_TYPE, + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, + CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM, + INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, + INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK, + INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, + FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, + VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK, + INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, + TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, + EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK +}; + +struct IntrinsicData { + + unsigned Id; + IntrinsicType Type; + unsigned Opc0; + unsigned Opc1; + + bool operator<(const IntrinsicData &RHS) const { + return Id < RHS.Id; + } + bool operator==(const IntrinsicData &RHS) const { + return RHS.Id == Id; + } +}; + +#define X86_INTRINSIC_DATA(id, type, op0, op1) \ + { Intrinsic::x86_##id, type, op0, op1 } + +/* + * IntrinsicsWithChain - the table should be sorted by Intrinsic ID - in + * the alphabetical order. 
+ */ +static const IntrinsicData IntrinsicsWithChain[] = { + X86_INTRINSIC_DATA(addcarry_u32, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarry_u64, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0), + + X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), + + X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH, + X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH, + X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm), + X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH, + X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, + X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), + + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), + 
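The GATHER and SCATTER rows in this table describe the memory-touching AVX-512 intrinsics (hence "with chain") and map them straight to machine-instruction opcodes instead of ISD nodes. Purely to illustrate what one such row corresponds to at the source level, and not taken from the imported header, a masked 32-bit-index gather can be written as follows (assumes AVX-512F and <immintrin.h>):

#include <immintrin.h>

// Gather 16 floats from base[idx[i]]; lanes whose mask bit is clear keep the value from 'old'.
__m512 gather_masked(const float *base, __m512i idx, __mmask16 k, __m512 old) {
  return _mm512_mask_i32gather_ps(old, k, idx, base, 4);  // scale 4: indices count float elements
}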
X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm, + X86::VSCATTERPF1DPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm, + X86::VSCATTERPF1DPSm), + X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm, + X86::VSCATTERPF1QPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm, + X86::VSCATTERPF1QPSm), + X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0), + + X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0), + X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0), + X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0), + + X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0), + X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0), + X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0), +}; + +/* + * Find Intrinsic data by intrinsic ID + */ +static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) { + + IntrinsicData IntrinsicToFind = {IntNo, INTR_NO_TYPE, 0, 0 }; + const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain), + IntrinsicToFind); + if (Data != std::end(IntrinsicsWithChain) && *Data == IntrinsicToFind) + return Data; + return nullptr; +} + +/* + * IntrinsicsWithoutChain - the table should be sorted by Intrinsic ID - in + * the alphabetical order. 
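getIntrinsicWithChain above relies on the table being sorted by Id: std::lower_bound performs a binary search, so an out-of-order entry would simply never be found. A small stand-alone sketch of the same lookup pattern (plain C++, not taken from this header) makes that contract explicit:

#include <algorithm>
#include <iterator>

struct Entry {
  unsigned Id;
  int Payload;
  bool operator<(const Entry &RHS) const { return Id < RHS.Id; }
};

// Must stay sorted by Id, exactly like the intrinsic tables.
static const Entry Table[] = { {10, 1}, {20, 2}, {30, 3} };

const Entry *lookup(unsigned Id) {
  Entry Key = {Id, 0};
  const Entry *I = std::lower_bound(std::begin(Table), std::end(Table), Key);
  if (I != std::end(Table) && I->Id == Id)
    return I;        // exact match
  return nullptr;    // not present (or the table was left unsorted)
}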
+ */ +static const IntrinsicData IntrinsicsWithoutChain[] = { + X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_w, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q, 
INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_512, 
CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD, + X86ISD::FMADD_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB, + X86ISD::FMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB, + X86ISD::FMSUB_RND), + + 
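FMA_OP_MASK3 marks the *_mask3 forms, where the write-mask selects between the fused result and the third (addend) operand rather than a separate pass-through source. For orientation only, and not part of the imported header, the matching user-level intrinsic looks roughly like this (assumes AVX-512F):

#include <immintrin.h>

// result[i] = k[i] ? a[i] * b[i] + c[i] : c[i]
__m512 fmadd_mask3(__m512 a, __m512 b, __m512 c, __mmask16 k) {
  return _mm512_mask3_fmadd_ps(a, b, c, k);
}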
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ps_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_128, 
BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, + X86ISD::CMPM_RND), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, 
X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, + X86ISD::CMPM_RND), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), + X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, X86ISD::VFPROUND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, X86ISD::VFPEXT), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTUDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + 
X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ps_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, 
INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSD, 0), + X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSS, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_b_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_b_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_b_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_d_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_d_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_d_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_q_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_q_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_q_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_w_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_w_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_w_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_q_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_q_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pand_q_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_d_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_d_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_d_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_q_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_q_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_q_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, 
INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, + X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, + X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_dq_512, INTR_TYPE_2OP_MASK, + X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_d_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_d_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_d_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_q_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_q_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_q_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_w_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_w_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_w_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_128, INTR_TYPE_2OP_MASK, + X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_256, 
INTR_TYPE_2OP_MASK, + X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_512, INTR_TYPE_2OP_MASK, + X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_por_d_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_d_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_d_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, 
INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_b_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_b_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_d_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_d_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_d_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_q_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_q_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_q_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_w_128, INTR_TYPE_2OP_MASK, 
ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_w_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_w_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 
0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + 
X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, + X86ISD::FSQRT_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, + X86ISD::FSQRT_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, 
X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD, + X86ISD::FMADD_RND), + + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, 
FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD, + X86ISD::FNMADD_RND), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD, + X86ISD::FNMADD_RND), + + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK, + 
X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD, + X86ISD::FMADD_RND), + + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + 
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0), + X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), 
+ X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + 
X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(sse2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(sse2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(sse2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(sse2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psubus_b, 
INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), + X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), + X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0), + X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phsub_d_128, 
INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0) +}; + +/* + * Retrieve data for Intrinsic without chain. + * Return nullptr if intrinsic is not defined in the table. + */ +static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) { + IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 }; + const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain), + IntrinsicToFind); + if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind) + return Data; + return nullptr; +} + +static void verifyIntrinsicTables() { + assert(std::is_sorted(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) && + std::is_sorted(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should be sorted by Intrinsic ID"); + assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) == + std::end(IntrinsicsWithoutChain)) && + (std::adjacent_find(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) == + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should have unique entries"); +} + +// X86 specific compare constants. 
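The getIntrinsicWithoutChain lookup above works only because the tables are kept sorted by intrinsic ID and free of duplicates, which is exactly what verifyIntrinsicTables asserts: std::lower_bound performs a binary search and the equality check afterwards confirms an exact hit. A minimal standalone sketch of the same pattern (the record layout, IDs, and helper name below are illustrative stand-ins, not the real LLVM definitions):

#include <algorithm>
#include <iostream>
#include <iterator>

// Stand-in for the real IntrinsicData record; ordering and equality use the ID only.
struct IntrinsicData {
  unsigned Id;
  unsigned Type;
  bool operator<(const IntrinsicData &RHS) const { return Id < RHS.Id; }
  bool operator==(const IntrinsicData &RHS) const { return Id == RHS.Id; }
};

// The table must stay sorted by Id with unique entries, or lower_bound gives wrong answers.
static const IntrinsicData Table[] = {{10, 1}, {20, 2}, {30, 3}};

static const IntrinsicData *lookup(unsigned Id) {
  IntrinsicData Key = {Id, 0};
  const IntrinsicData *I =
      std::lower_bound(std::begin(Table), std::end(Table), Key);
  return (I != std::end(Table) && *I == Key) ? I : nullptr;
}

int main() {
  std::cout << (lookup(20) ? "found" : "missing") << "\n";  // found
  std::cout << (lookup(25) ? "found" : "missing") << "\n";  // missing
}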
+// They must be kept in synch with avxintrin.h +#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ +#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ +#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ +#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ +#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ +#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ +#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ +#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ +#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ +#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ +#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ +#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ +#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ +#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ +#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ +#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ +#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */ +#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ +#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ +#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */ +#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ +#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ +#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ +#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ +#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ +#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */ + +/* +* Get comparison modifier from _mm_comi_round_sd/ss intrinsic +* Return tuple <isOrdered, X86 condcode> +*/ +static std::tuple<bool,unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) { + ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(imm); + unsigned IntImm = CImm->getZExtValue(); + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (IntImm) { + default: llvm_unreachable("Invalid floating point compare value for Comi!"); + case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling) + case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling) + return std::make_tuple(true, X86::COND_E); + case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling) + case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling) + return std::make_tuple(false , X86::COND_E); + case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling) + case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_B); + case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling) + case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal 
(unordered, nonsignaling) + return std::make_tuple(false , X86::COND_B); + case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling) + case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_BE); + case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling) + case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_BE); + case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling) + case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_A); + case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling) + case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_A); + case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling) + case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_AE); + case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling) + case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_AE); + case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling) + case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling) + return std::make_tuple(true, X86::COND_NE); + case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling) + case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling) + return std::make_tuple(false, X86::COND_NE); + } +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp new file mode 100644 index 0000000..e1ca558 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -0,0 +1,1459 @@ +//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower X86 MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "X86AsmPrinter.h" +#include "X86RegisterInfo.h" +#include "X86ShuffleDecodeConstantPool.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +namespace { + +/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst. 
+class X86MCInstLower { + MCContext &Ctx; + const MachineFunction &MF; + const TargetMachine &TM; + const MCAsmInfo &MAI; + X86AsmPrinter &AsmPrinter; +public: + X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter); + + Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI, + const MachineOperand &MO) const; + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + +private: + MachineModuleInfoMachO &getMachOMMI() const; + Mangler *getMang() const { + return AsmPrinter.Mang; + } +}; + +} // end anonymous namespace + +// Emit a minimal sequence of nops spanning NumBytes bytes. +static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, + const MCSubtargetInfo &STI); + +namespace llvm { + X86AsmPrinter::StackMapShadowTracker::StackMapShadowTracker(TargetMachine &TM) + : TM(TM), InShadow(false), RequiredShadowSize(0), CurrentShadowSize(0) {} + + X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {} + + void + X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &F) { + MF = &F; + CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo(), MF->getContext())); + } + + void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, + const MCSubtargetInfo &STI) { + if (InShadow) { + SmallString<256> Code; + SmallVector<MCFixup, 4> Fixups; + raw_svector_ostream VecOS(Code); + CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI); + CurrentShadowSize += Code.size(); + if (CurrentShadowSize >= RequiredShadowSize) + InShadow = false; // The shadow is big enough. Stop counting. + } + } + + void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( + MCStreamer &OutStreamer, const MCSubtargetInfo &STI) { + if (InShadow && CurrentShadowSize < RequiredShadowSize) { + InShadow = false; + EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize, + MF->getSubtarget<X86Subtarget>().is64Bit(), STI); + } + } + + void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { + OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); + SMShadowTracker.count(Inst, getSubtargetInfo()); + } +} // end llvm namespace + +X86MCInstLower::X86MCInstLower(const MachineFunction &mf, + X86AsmPrinter &asmprinter) + : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()), + AsmPrinter(asmprinter) {} + +MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { + return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>(); +} + + +/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol +/// operand to an MCSymbol. +MCSymbol *X86MCInstLower:: +GetSymbolFromOperand(const MachineOperand &MO) const { + const DataLayout &DL = MF.getDataLayout(); + assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); + + MCSymbol *Sym = nullptr; + SmallString<128> Name; + StringRef Suffix; + + switch (MO.getTargetFlags()) { + case X86II::MO_DLLIMPORT: + // Handle dllimport linkage. 
+ Name += "__imp_"; + break; + case X86II::MO_DARWIN_STUB: + Suffix = "$stub"; + break; + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: + Suffix = "$non_lazy_ptr"; + break; + } + + if (!Suffix.empty()) + Name += DL.getPrivateGlobalPrefix(); + + unsigned PrefixLen = Name.size(); + + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + AsmPrinter.getNameWithPrefix(Name, GV); + } else if (MO.isSymbol()) { + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); + } else if (MO.isMBB()) { + assert(Suffix.empty()); + Sym = MO.getMBB()->getSymbol(); + } + unsigned OrigLen = Name.size() - PrefixLen; + + Name += Suffix; + if (!Sym) + Sym = Ctx.getOrCreateSymbol(Name); + + StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen); + + // If the target flags on the operand changes the name of the symbol, do that + // before we return the symbol. + switch (MO.getTargetFlags()) { + default: break; + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: { + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getGVStubEntry(Sym); + if (!StubSym.getPointer()) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + break; + } + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: { + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getHiddenGVStubEntry(Sym); + if (!StubSym.getPointer()) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + break; + } + case X86II::MO_DARWIN_STUB: { + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getFnStubEntry(Sym); + if (StubSym.getPointer()) + return Sym; + + if (MO.isGlobal()) { + StubSym = + MachineModuleInfoImpl:: + StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } else { + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Ctx.getOrCreateSymbol(OrigName), false); + } + break; + } + } + + return Sym; +} + +MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + const MCExpr *Expr = nullptr; + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + + switch (MO.getTargetFlags()) { + default: llvm_unreachable("Unknown target flag on GV operand"); + case X86II::MO_NO_FLAG: // No flag. + // These affect the name of the symbol, not any suffix. + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DLLIMPORT: + case X86II::MO_DARWIN_STUB: + break; + + case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break; + case X86II::MO_TLVP_PIC_BASE: + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx); + // Subtract the pic base. 
+ Expr = MCBinaryExpr::createSub(Expr, + MCSymbolRefExpr::create(MF.getPICBaseSymbol(), + Ctx), + Ctx); + break; + case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break; + case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break; + case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break; + case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break; + case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break; + case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break; + case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break; + case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break; + case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break; + case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break; + case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break; + case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break; + case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break; + case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break; + case X86II::MO_PIC_BASE_OFFSET: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: + Expr = MCSymbolRefExpr::create(Sym, Ctx); + // Subtract the pic base. + Expr = MCBinaryExpr::createSub(Expr, + MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), + Ctx); + if (MO.isJTI()) { + assert(MAI.doesSetDirectiveSuppressesReloc()); + // If .set directive is supported, use it to reduce the number of + // relocations the assembler will generate for differences between + // local labels. This is only safe when the symbols are in the same + // section so we are restricting it to jumptable references. + MCSymbol *Label = Ctx.createTempSymbol(); + AsmPrinter.OutStreamer->EmitAssignment(Label, Expr); + Expr = MCSymbolRefExpr::create(Label, Ctx); + } + break; + } + + if (!Expr) + Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx); + + if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(MO.getOffset(), Ctx), + Ctx); + return MCOperand::createExpr(Expr); +} + + +/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with +/// a short fixed-register form. +static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) { + unsigned ImmOp = Inst.getNumOperands() - 1; + assert(Inst.getOperand(0).isReg() && + (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) && + ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) || + Inst.getNumOperands() == 2) && "Unexpected instruction!"); + + // Check whether the destination register can be fixed. + unsigned Reg = Inst.getOperand(0).getReg(); + if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX) + return; + + // If so, rewrite the instruction. + MCOperand Saved = Inst.getOperand(ImmOp); + Inst = MCInst(); + Inst.setOpcode(Opcode); + Inst.addOperand(Saved); +} + +/// \brief If a movsx instruction has a shorter encoding for the used register +/// simplify the instruction to use it instead. 
+static void SimplifyMOVSX(MCInst &Inst) { + unsigned NewOpcode = 0; + unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg(); + switch (Inst.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw + if (Op0 == X86::AX && Op1 == X86::AL) + NewOpcode = X86::CBW; + break; + case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl + if (Op0 == X86::EAX && Op1 == X86::AX) + NewOpcode = X86::CWDE; + break; + case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq + if (Op0 == X86::RAX && Op1 == X86::EAX) + NewOpcode = X86::CDQE; + break; + } + + if (NewOpcode != 0) { + Inst = MCInst(); + Inst.setOpcode(NewOpcode); + } +} + +/// \brief Simplify things like MOV32rm to MOV32o32a. +static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, + unsigned Opcode) { + // Don't make these simplifications in 64-bit mode; other assemblers don't + // perform them because they make the code larger. + if (Printer.getSubtarget().is64Bit()) + return; + + bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg(); + unsigned AddrBase = IsStore; + unsigned RegOp = IsStore ? 0 : 5; + unsigned AddrOp = AddrBase + 3; + assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() && + Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() && + Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() && + Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() && + Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() && + (Inst.getOperand(AddrOp).isExpr() || + Inst.getOperand(AddrOp).isImm()) && + "Unexpected instruction!"); + + // Check whether the destination register can be fixed. + unsigned Reg = Inst.getOperand(RegOp).getReg(); + if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX) + return; + + // Check whether this is an absolute address. + // FIXME: We know TLVP symbol refs aren't, but there should be a better way + // to do this here. + bool Absolute = true; + if (Inst.getOperand(AddrOp).isExpr()) { + const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr(); + if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE)) + if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP) + Absolute = false; + } + + if (Absolute && + (Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 || + Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 || + Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0)) + return; + + // If so, rewrite the instruction. + MCOperand Saved = Inst.getOperand(AddrOp); + MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg); + Inst = MCInst(); + Inst.setOpcode(Opcode); + Inst.addOperand(Saved); + Inst.addOperand(Seg); +} + +static unsigned getRetOpcode(const X86Subtarget &Subtarget) { + return Subtarget.is64Bit() ? X86::RETQ : X86::RETL; +} + +Optional<MCOperand> +X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, + const MachineOperand &MO) const { + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. 
+ if (MO.isImplicit()) + return None; + return MCOperand::createReg(MO.getReg()); + case MachineOperand::MO_Immediate: + return MCOperand::createImm(MO.getImm()); + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + return LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); + case MachineOperand::MO_MCSymbol: + return LowerSymbolOperand(MO, MO.getMCSymbol()); + case MachineOperand::MO_JumpTableIndex: + return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex())); + case MachineOperand::MO_ConstantPoolIndex: + return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex())); + case MachineOperand::MO_BlockAddress: + return LowerSymbolOperand( + MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); + case MachineOperand::MO_RegisterMask: + // Ignore call clobbers. + return None; + } +} + +void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (const MachineOperand &MO : MI->operands()) + if (auto MaybeMCOp = LowerMachineOperand(MI, MO)) + OutMI.addOperand(MaybeMCOp.getValue()); + + // Handle a few special cases to eliminate operand modifiers. +ReSimplify: + switch (OutMI.getOpcode()) { + case X86::LEA64_32r: + case X86::LEA64r: + case X86::LEA16r: + case X86::LEA32r: + // LEA should have a segment register, but it must be empty. + assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands && + "Unexpected # of LEA operands"); + assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 && + "LEA has segment specified!"); + break; + + // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B + // if one of the registers is extended, but other isn't. + case X86::VMOVZPQILo2PQIrr: + case X86::VMOVAPDrr: + case X86::VMOVAPDYrr: + case X86::VMOVAPSrr: + case X86::VMOVAPSYrr: + case X86::VMOVDQArr: + case X86::VMOVDQAYrr: + case X86::VMOVDQUrr: + case X86::VMOVDQUYrr: + case X86::VMOVUPDrr: + case X86::VMOVUPDYrr: + case X86::VMOVUPSrr: + case X86::VMOVUPSYrr: { + if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) && + X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + } + OutMI.setOpcode(NewOpc); + } + break; + } + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) && + X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break; + case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break; + } + OutMI.setOpcode(NewOpc); + } + break; 
} + + // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register + // inputs modeled as normal uses instead of implicit uses. As such, truncate + // off all but the first operand (the callee). FIXME: Change isel. + case X86::TAILJMPr64: + case X86::TAILJMPr64_REX: + case X86::CALL64r: + case X86::CALL64pcrel32: { + unsigned Opcode = OutMI.getOpcode(); + MCOperand Saved = OutMI.getOperand(0); + OutMI = MCInst(); + OutMI.setOpcode(Opcode); + OutMI.addOperand(Saved); + break; + } + + case X86::EH_RETURN: + case X86::EH_RETURN64: { + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget())); + break; + } + + case X86::CLEANUPRET: { + // Replace CLEANUPRET with the appropriate RET. + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget())); + break; + } + + case X86::CATCHRET: { + // Replace CATCHRET with the appropriate RET. + const X86Subtarget &Subtarget = AsmPrinter.getSubtarget(); + unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(Subtarget)); + OutMI.addOperand(MCOperand::createReg(ReturnReg)); + break; + } + + // TAILJMPr, TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions. + case X86::TAILJMPr: + case X86::TAILJMPd: + case X86::TAILJMPd64: { + unsigned Opcode; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::TAILJMPr: Opcode = X86::JMP32r; break; + case X86::TAILJMPd: + case X86::TAILJMPd64: Opcode = X86::JMP_1; break; + } + + MCOperand Saved = OutMI.getOperand(0); + OutMI = MCInst(); + OutMI.setOpcode(Opcode); + OutMI.addOperand(Saved); + break; + } + + case X86::DEC16r: + case X86::DEC32r: + case X86::INC16r: + case X86::INC32r: + // If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions. + if (!AsmPrinter.getSubtarget().is64Bit()) { + unsigned Opcode; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::DEC16r: Opcode = X86::DEC16r_alt; break; + case X86::DEC32r: Opcode = X86::DEC32r_alt; break; + case X86::INC16r: Opcode = X86::INC16r_alt; break; + case X86::INC32r: Opcode = X86::INC32r_alt; break; + } + OutMI.setOpcode(Opcode); + } + break; + + // These are pseudo-ops for OR to help with the OR->ADD transformation. We do + // this with an ugly goto in case the resultant OR uses EAX and needs the + // short form.
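A quick illustration of why lowering the ADD*_DB ("disjoint bits") pseudos to OR is safe: when the two operands share no set bits, the addition cannot generate a carry, so a + b and a | b produce the same value. A minimal check with hypothetical operand values (not taken from the diff):

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical operand values whose bit patterns are disjoint (a & b == 0).
  std::uint32_t a = 0xF0F0F0F0u, b = 0x01010101u;
  assert((a & b) == 0);
  // No overlapping bits means no carries, so ADD and OR agree; this is what
  // makes it legal to emit the ADD*_DB pseudos as plain ORs.
  assert((a + b) == (a | b));
  return 0;
}

The cases below then swap in the OR opcode and jump back to ReSimplify so the result can still be shrunk to the short EAX immediate form when applicable.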
+ case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify; + case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify; + case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify; + case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify; + case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify; + case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify; + case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify; + case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify; + case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify; + + // Atomic load and store require a separate pseudo-inst because Acquire + // implies mayStore and Release implies mayLoad; fix these to regular MOV + // instructions here + case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; + case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; + case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; + case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; + case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; + case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; + case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; + case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify; + case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify; + case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; + case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; + case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify; + case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify; + case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify; + case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify; + case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify; + case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify; + case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify; + case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify; + case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify; + case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify; + case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify; + case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_XOR64mr: 
OutMI.setOpcode(X86::XOR64mr); goto ReSimplify; + case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; + case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; + case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; + case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify; + case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify; + case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify; + case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify; + case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify; + + // We don't currently select the correct instruction form for instructions + // which have a short %eax, etc. form. Handle this by custom lowering, for + // now. + // + // Note, we are currently not handling the following instructions: + // MOV64ao8, MOV64o8a + // XCHG16ar, XCHG32ar, XCHG64ar + case X86::MOV8mr_NOREX: + case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o32a); break; + case X86::MOV8rm_NOREX: + case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao32); break; + case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o32a); break; + case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao32); break; + case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break; + case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break; + + case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break; + case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break; + case X86::ADC32ri: SimplifyShortImmForm(OutMI, X86::ADC32i32); break; + case X86::ADC64ri32: SimplifyShortImmForm(OutMI, X86::ADC64i32); break; + case X86::ADD8ri: SimplifyShortImmForm(OutMI, X86::ADD8i8); break; + case X86::ADD16ri: SimplifyShortImmForm(OutMI, X86::ADD16i16); break; + case X86::ADD32ri: SimplifyShortImmForm(OutMI, X86::ADD32i32); break; + case X86::ADD64ri32: SimplifyShortImmForm(OutMI, X86::ADD64i32); break; + case X86::AND8ri: SimplifyShortImmForm(OutMI, X86::AND8i8); break; + case X86::AND16ri: SimplifyShortImmForm(OutMI, X86::AND16i16); break; + case X86::AND32ri: SimplifyShortImmForm(OutMI, X86::AND32i32); break; + case X86::AND64ri32: SimplifyShortImmForm(OutMI, X86::AND64i32); break; + case X86::CMP8ri: SimplifyShortImmForm(OutMI, X86::CMP8i8); break; + case X86::CMP16ri: SimplifyShortImmForm(OutMI, X86::CMP16i16); break; + case X86::CMP32ri: SimplifyShortImmForm(OutMI, X86::CMP32i32); break; + case X86::CMP64ri32: SimplifyShortImmForm(OutMI, X86::CMP64i32); break; + case X86::OR8ri: SimplifyShortImmForm(OutMI, X86::OR8i8); break; + case X86::OR16ri: SimplifyShortImmForm(OutMI, X86::OR16i16); break; + case X86::OR32ri: SimplifyShortImmForm(OutMI, X86::OR32i32); break; + case X86::OR64ri32: SimplifyShortImmForm(OutMI, X86::OR64i32); break; + case X86::SBB8ri: SimplifyShortImmForm(OutMI, X86::SBB8i8); break; + case X86::SBB16ri: SimplifyShortImmForm(OutMI, X86::SBB16i16); break; + case X86::SBB32ri: SimplifyShortImmForm(OutMI, X86::SBB32i32); break; + case X86::SBB64ri32: SimplifyShortImmForm(OutMI, X86::SBB64i32); break; + case X86::SUB8ri: SimplifyShortImmForm(OutMI, X86::SUB8i8); break; + case X86::SUB16ri: SimplifyShortImmForm(OutMI, X86::SUB16i16); break; + case X86::SUB32ri: SimplifyShortImmForm(OutMI, X86::SUB32i32); break; + case X86::SUB64ri32: SimplifyShortImmForm(OutMI, X86::SUB64i32); break; + case X86::TEST8ri: 
SimplifyShortImmForm(OutMI, X86::TEST8i8); break; + case X86::TEST16ri: SimplifyShortImmForm(OutMI, X86::TEST16i16); break; + case X86::TEST32ri: SimplifyShortImmForm(OutMI, X86::TEST32i32); break; + case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break; + case X86::XOR8ri: SimplifyShortImmForm(OutMI, X86::XOR8i8); break; + case X86::XOR16ri: SimplifyShortImmForm(OutMI, X86::XOR16i16); break; + case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break; + case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break; + + // Try to shrink some forms of movsx. + case X86::MOVSX16rr8: + case X86::MOVSX32rr16: + case X86::MOVSX64rr32: + SimplifyMOVSX(OutMI); + break; + } +} + +void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, + const MachineInstr &MI) { + + bool is64Bits = MI.getOpcode() == X86::TLS_addr64 || + MI.getOpcode() == X86::TLS_base_addr64; + + bool needsPadding = MI.getOpcode() == X86::TLS_addr64; + + MCContext &context = OutStreamer->getContext(); + + if (needsPadding) + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + + MCSymbolRefExpr::VariantKind SRVK; + switch (MI.getOpcode()) { + case X86::TLS_addr32: + case X86::TLS_addr64: + SRVK = MCSymbolRefExpr::VK_TLSGD; + break; + case X86::TLS_base_addr32: + SRVK = MCSymbolRefExpr::VK_TLSLDM; + break; + case X86::TLS_base_addr64: + SRVK = MCSymbolRefExpr::VK_TLSLD; + break; + default: + llvm_unreachable("unexpected opcode"); + } + + MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)); + const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context); + + MCInst LEA; + if (is64Bits) { + LEA.setOpcode(X86::LEA64r); + LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest + LEA.addOperand(MCOperand::createReg(X86::RIP)); // base + LEA.addOperand(MCOperand::createImm(1)); // scale + LEA.addOperand(MCOperand::createReg(0)); // index + LEA.addOperand(MCOperand::createExpr(symRef)); // disp + LEA.addOperand(MCOperand::createReg(0)); // seg + } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::createReg(X86::EBX)); // base + LEA.addOperand(MCOperand::createImm(1)); // scale + LEA.addOperand(MCOperand::createReg(0)); // index + LEA.addOperand(MCOperand::createExpr(symRef)); // disp + LEA.addOperand(MCOperand::createReg(0)); // seg + } else { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::createReg(0)); // base + LEA.addOperand(MCOperand::createImm(1)); // scale + LEA.addOperand(MCOperand::createReg(X86::EBX)); // index + LEA.addOperand(MCOperand::createExpr(symRef)); // disp + LEA.addOperand(MCOperand::createReg(0)); // seg + } + EmitAndCountInstruction(LEA); + + if (needsPadding) { + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); + } + + StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; + MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name); + const MCSymbolRefExpr *tlsRef = + MCSymbolRefExpr::create(tlsGetAddr, + MCSymbolRefExpr::VK_PLT, + context); + + EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32 + : X86::CALLpcrel32) + .addExpr(tlsRef)); +} + +/// \brief Emit the optimal amount of multi-byte nops on X86. 
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSubtargetInfo &STI) { + // This works only for 64bit. For 32bit we have to do additional checking if + // the CPU supports multi-byte nops. + assert(Is64Bit && "EmitNops only supports X86-64"); + while (NumBytes) { + unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; + Opc = IndexReg = Displacement = SegmentReg = 0; + BaseReg = X86::RAX; ScaleVal = 1; + switch (NumBytes) { + case 0: llvm_unreachable("Zero nops?"); break; + case 1: NumBytes -= 1; Opc = X86::NOOP; break; + case 2: NumBytes -= 2; Opc = X86::XCHG16ar; break; + case 3: NumBytes -= 3; Opc = X86::NOOPL; break; + case 4: NumBytes -= 4; Opc = X86::NOOPL; Displacement = 8; break; + case 5: NumBytes -= 5; Opc = X86::NOOPL; Displacement = 8; + IndexReg = X86::RAX; break; + case 6: NumBytes -= 6; Opc = X86::NOOPW; Displacement = 8; + IndexReg = X86::RAX; break; + case 7: NumBytes -= 7; Opc = X86::NOOPL; Displacement = 512; break; + case 8: NumBytes -= 8; Opc = X86::NOOPL; Displacement = 512; + IndexReg = X86::RAX; break; + case 9: NumBytes -= 9; Opc = X86::NOOPW; Displacement = 512; + IndexReg = X86::RAX; break; + default: NumBytes -= 10; Opc = X86::NOOPW; Displacement = 512; + IndexReg = X86::RAX; SegmentReg = X86::CS; break; + } + + unsigned NumPrefixes = std::min(NumBytes, 5U); + NumBytes -= NumPrefixes; + for (unsigned i = 0; i != NumPrefixes; ++i) + OS.EmitBytes("\x66"); + + switch (Opc) { + default: llvm_unreachable("Unexpected opcode"); break; + case X86::NOOP: + OS.EmitInstruction(MCInstBuilder(Opc), STI); + break; + case X86::XCHG16ar: + OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI); + break; + case X86::NOOPL: + case X86::NOOPW: + OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg) + .addImm(ScaleVal).addReg(IndexReg) + .addImm(Displacement).addReg(SegmentReg), STI); + break; + } + } // while (NumBytes) +} + +void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, + X86MCInstLower &MCIL) { + assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64"); + + StatepointOpers SOpers(&MI); + if (unsigned PatchBytes = SOpers.getNumPatchBytes()) { + EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(), + getSubtargetInfo()); + } else { + // Lower call target and choose correct opcode + const MachineOperand &CallTarget = SOpers.getCallTarget(); + MCOperand CallTargetMCOp; + unsigned CallOpcode; + switch (CallTarget.getType()) { + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + CallTargetMCOp = MCIL.LowerSymbolOperand( + CallTarget, MCIL.GetSymbolFromOperand(CallTarget)); + CallOpcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // address. You'll fail asserts during load & relocation if this + // symbol is to far away. (TODO: support non-relative addressing) + break; + case MachineOperand::MO_Immediate: + CallTargetMCOp = MCOperand::createImm(CallTarget.getImm()); + CallOpcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // immediate. You'll fail asserts during load & relocation if this + // address is to far away. 
(TODO: support non-relative addressing) + break; + case MachineOperand::MO_Register: + CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); + CallOpcode = X86::CALL64r; + break; + default: + llvm_unreachable("Unsupported operand type in statepoint call target"); + break; + } + + // Emit call + MCInst CallInst; + CallInst.setOpcode(CallOpcode); + CallInst.addOperand(CallTargetMCOp); + OutStreamer->EmitInstruction(CallInst, getSubtargetInfo()); + } + + // Record our statepoint node in the same section used by STACKMAP + // and PATCHPOINT + SM.recordStatepoint(MI); +} + +void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI, + X86MCInstLower &MCIL) { + // FAULTING_LOAD_OP <def>, <handler label>, <load opcode>, <load operands> + + unsigned LoadDefRegister = MI.getOperand(0).getReg(); + MCSymbol *HandlerLabel = MI.getOperand(1).getMCSymbol(); + unsigned LoadOpcode = MI.getOperand(2).getImm(); + unsigned LoadOperandsBeginIdx = 3; + + FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel); + + MCInst LoadMI; + LoadMI.setOpcode(LoadOpcode); + + if (LoadDefRegister != X86::NoRegister) + LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + + for (auto I = MI.operands_begin() + LoadOperandsBeginIdx, + E = MI.operands_end(); + I != E; ++I) + if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I)) + LoadMI.addOperand(MaybeOperand.getValue()); + + OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo()); +} + +// Lower a stackmap of the form: +// <id>, <shadowBytes>, ... +void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + SM.recordStackMap(MI); + unsigned NumShadowBytes = MI.getOperand(1).getImm(); + SMShadowTracker.reset(NumShadowBytes); +} + +// Lower a patchpoint of the form: +// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ... +void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, + X86MCInstLower &MCIL) { + assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64"); + + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + + SM.recordPatchPoint(MI); + + PatchPointOpers opers(&MI); + unsigned ScratchIdx = opers.getNextScratchIdx(); + unsigned EncodedBytes = 0; + const MachineOperand &CalleeMO = + opers.getMetaOper(PatchPointOpers::TargetPos); + + // Check for null target. If target is non-null (i.e. is non-zero or is + // symbolic) then emit a call. + if (!(CalleeMO.isImm() && !CalleeMO.getImm())) { + MCOperand CalleeMCOp; + switch (CalleeMO.getType()) { + default: + /// FIXME: Add a verifier check for bad callee types. + llvm_unreachable("Unrecognized callee operand type."); + case MachineOperand::MO_Immediate: + if (CalleeMO.getImm()) + CalleeMCOp = MCOperand::createImm(CalleeMO.getImm()); + break; + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + CalleeMCOp = + MCIL.LowerSymbolOperand(CalleeMO, + MCIL.GetSymbolFromOperand(CalleeMO)); + break; + } + + // Emit MOV to materialize the target address and the CALL to target. + // This is encoded with 12-13 bytes, depending on which register is used. + unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg(); + if (X86II::isX86_64ExtendedReg(ScratchReg)) + EncodedBytes = 13; + else + EncodedBytes = 12; + + EmitAndCountInstruction( + MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp)); + EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); + } + + // Emit padding. 
+ unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + assert(NumBytes >= EncodedBytes && + "Patchpoint can't request size less than the length of a call."); + + EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(), + getSubtargetInfo()); +} + +// Returns instruction preceding MBBI in MachineFunction. +// If MBBI is the first instruction of the first basic block, returns null. +static MachineBasicBlock::const_iterator +PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { + const MachineBasicBlock *MBB = MBBI->getParent(); + while (MBBI == MBB->begin()) { + if (MBB == MBB->getParent()->begin()) + return nullptr; + MBB = MBB->getPrevNode(); + MBBI = MBB->end(); + } + return --MBBI; +} + +static const Constant *getConstantFromPool(const MachineInstr &MI, + const MachineOperand &Op) { + if (!Op.isCPI()) + return nullptr; + + ArrayRef<MachineConstantPoolEntry> Constants = + MI.getParent()->getParent()->getConstantPool()->getConstants(); + const MachineConstantPoolEntry &ConstantEntry = + Constants[Op.getIndex()]; + + // Bail if this is a machine constant pool entry, we won't be able to dig out + // anything useful. + if (ConstantEntry.isMachineConstantPoolEntry()) + return nullptr; + + auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal); + assert((!C || ConstantEntry.getType() == C->getType()) && + "Expected a constant of the same type!"); + return C; +} + +static std::string getShuffleComment(const MachineOperand &DstOp, + const MachineOperand &SrcOp, + ArrayRef<int> Mask) { + std::string Comment; + + // Compute the name for a register. This is really goofy because we have + // multiple instruction printers that could (in theory) use different + // names. Fortunately most people use the ATT style (outside of Windows) + // and they actually agree on register naming here. Ultimately, this is + // a comment, and so its OK if it isn't perfect. + auto GetRegisterName = [](unsigned RegNum) -> StringRef { + return X86ATTInstPrinter::getRegisterName(RegNum); + }; + + StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem"; + StringRef SrcName = SrcOp.isReg() ? GetRegisterName(SrcOp.getReg()) : "mem"; + + raw_string_ostream CS(Comment); + CS << DstName << " = "; + bool NeedComma = false; + bool InSrc = false; + for (int M : Mask) { + // Wrap up any prior entry... + if (M == SM_SentinelZero && InSrc) { + InSrc = false; + CS << "]"; + } + if (NeedComma) + CS << ","; + else + NeedComma = true; + + // Print this shuffle... + if (M == SM_SentinelZero) { + CS << "zero"; + } else { + if (!InSrc) { + InSrc = true; + CS << SrcName << "["; + } + if (M == SM_SentinelUndef) + CS << "u"; + else + CS << M; + } + } + if (InSrc) + CS << "]"; + CS.flush(); + + return Comment; +} + +void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { + X86MCInstLower MCInstLowering(*MF, *this); + const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); + + switch (MI->getOpcode()) { + case TargetOpcode::DBG_VALUE: + llvm_unreachable("Should be handled target independently"); + + // Emit nothing here but a comment if we can. + case X86::Int_MemBarrier: + OutStreamer->emitRawComment("MEMBARRIER"); + return; + + + case X86::EH_RETURN: + case X86::EH_RETURN64: { + // Lower these as normal, but add some comments. 
+ unsigned Reg = MI->getOperand(0).getReg(); + OutStreamer->AddComment(StringRef("eh_return, addr: %") + + X86ATTInstPrinter::getRegisterName(Reg)); + break; + } + case X86::CLEANUPRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CLEANUPRET"); + break; + } + + case X86::CATCHRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CATCHRET"); + break; + } + + case X86::TAILJMPr: + case X86::TAILJMPm: + case X86::TAILJMPd: + case X86::TAILJMPr64: + case X86::TAILJMPm64: + case X86::TAILJMPd64: + case X86::TAILJMPr64_REX: + case X86::TAILJMPm64_REX: + case X86::TAILJMPd64_REX: + // Lower these as normal, but add some comments. + OutStreamer->AddComment("TAILCALL"); + break; + + case X86::TLS_addr32: + case X86::TLS_addr64: + case X86::TLS_base_addr32: + case X86::TLS_base_addr64: + return LowerTlsAddr(MCInstLowering, *MI); + + case X86::MOVPC32r: { + // This is a pseudo op for a two instruction sequence with a label, which + // looks like: + // call "L1$pb" + // "L1$pb": + // popl %esi + + // Emit the call. + MCSymbol *PICBase = MF->getPICBaseSymbol(); + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32) + .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + + const X86FrameLowering* FrameLowering = + MF->getSubtarget<X86Subtarget>().getFrameLowering(); + bool hasFP = FrameLowering->hasFP(*MF); + + // TODO: This is needed only if we require precise CFA. + bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() && + !OutStreamer->getDwarfFrameInfos().back().End; + + int stackGrowth = -RI->getSlotSize(); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth); + } + + // Emit the label. + OutStreamer->EmitLabel(PICBase); + + // popl $reg + EmitAndCountInstruction(MCInstBuilder(X86::POP32r) + .addReg(MI->getOperand(0).getReg())); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth); + } + return; + } + + case X86::ADD32ri: { + // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri. + if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS) + break; + + // Okay, we have something like: + // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL) + + // For this, we want to print something like: + // MYGLOBAL + (. - PICBASE) + // However, we can't generate a ".", so just emit a new label here and refer + // to it. + MCSymbol *DotSym = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(DotSym); + + // Now that we have emitted the label, lower the complex operand expression. 
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2)); + + const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext); + const MCExpr *PICBase = + MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext); + DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext); + + DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext), + DotExpr, OutContext); + + EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(DotExpr)); + return; + } + case TargetOpcode::STATEPOINT: + return LowerSTATEPOINT(*MI, MCInstLowering); + + case TargetOpcode::FAULTING_LOAD_OP: + return LowerFAULTING_LOAD_OP(*MI, MCInstLowering); + + case TargetOpcode::STACKMAP: + return LowerSTACKMAP(*MI); + + case TargetOpcode::PATCHPOINT: + return LowerPATCHPOINT(*MI, MCInstLowering); + + case X86::MORESTACK_RET: + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); + return; + + case X86::MORESTACK_RET_RESTORE_R10: + // Return, then restore R10. + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); + EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr) + .addReg(X86::R10) + .addReg(X86::RAX)); + return; + + case X86::SEH_PushReg: + OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm())); + return; + + case X86::SEH_SaveReg: + OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_SaveXMM: + OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_StackAlloc: + OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm()); + return; + + case X86::SEH_SetFrame: + OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_PushFrame: + OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm()); + return; + + case X86::SEH_EndPrologue: + OutStreamer->EmitWinCFIEndProlog(); + return; + + case X86::SEH_Epilogue: { + MachineBasicBlock::const_iterator MBBI(MI); + // Check if preceded by a call and emit nop if so. + for (MBBI = PrevCrossBBInst(MBBI); MBBI; MBBI = PrevCrossBBInst(MBBI)) { + // Conservatively assume that pseudo instructions don't emit code and keep + // looking for a call. We may emit an unnecessary nop in some cases. + if (!MBBI->isPseudo()) { + if (MBBI->isCall()) + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + break; + } + } + return; + } + + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. 
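+ // Illustrative example: for a PSHUFB whose mask is loaded from the constant
+ // pool, the verbose-asm comment produced via getShuffleComment() above might
+ // look like
+ //   xmm0 = xmm1[0,2,u,4],zero,zero
+ // where "u" marks an undef lane and "zero" marks a zeroed lane.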
+ case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrm: + case X86::VPSHUFBZrmk: + case X86::VPSHUFBZrmkz: { + if (!OutStreamer->isVerboseAsm()) + break; + unsigned SrcIdx, MaskIdx; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZrm: + SrcIdx = 1; MaskIdx = 5; break; + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrmkz: + SrcIdx = 2; MaskIdx = 6; break; + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZrmk: + SrcIdx = 3; MaskIdx = 7; break; + } + + assert(MI->getNumOperands() >= 6 && + "We should always have at least 6 operands!"); + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp = MI->getOperand(SrcIdx); + const MachineOperand &MaskOp = MI->getOperand(MaskIdx); + + if (auto *C = getConstantFromPool(*MI, MaskOp)) { + SmallVector<int, 16> Mask; + DecodePSHUFBMask(C, Mask); + if (!Mask.empty()) + OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + } + break; + } + case X86::VPERMILPSrm: + case X86::VPERMILPDrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPDYrm: { + if (!OutStreamer->isVerboseAsm()) + break; + assert(MI->getNumOperands() > 5 && + "We should always have at least 5 operands!"); + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp = MI->getOperand(1); + const MachineOperand &MaskOp = MI->getOperand(5); + + unsigned ElSize; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VPERMILPSrm: case X86::VPERMILPSYrm: ElSize = 32; break; + case X86::VPERMILPDrm: case X86::VPERMILPDYrm: ElSize = 64; break; + } + + if (auto *C = getConstantFromPool(*MI, MaskOp)) { + SmallVector<int, 16> Mask; + DecodeVPERMILPMask(C, ElSize, Mask); + if (!Mask.empty()) + OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + } + break; + } + +#define MOV_CASE(Prefix, Suffix) \ + case X86::Prefix##MOVAPD##Suffix##rm: \ + case X86::Prefix##MOVAPS##Suffix##rm: \ + case X86::Prefix##MOVUPD##Suffix##rm: \ + case X86::Prefix##MOVUPS##Suffix##rm: \ + case X86::Prefix##MOVDQA##Suffix##rm: \ + case X86::Prefix##MOVDQU##Suffix##rm: + +#define MOV_AVX512_CASE(Suffix) \ + case X86::VMOVDQA64##Suffix##rm: \ + case X86::VMOVDQA32##Suffix##rm: \ + case X86::VMOVDQU64##Suffix##rm: \ + case X86::VMOVDQU32##Suffix##rm: \ + case X86::VMOVDQU16##Suffix##rm: \ + case X86::VMOVDQU8##Suffix##rm: \ + case X86::VMOVAPS##Suffix##rm: \ + case X86::VMOVAPD##Suffix##rm: \ + case X86::VMOVUPS##Suffix##rm: \ + case X86::VMOVUPD##Suffix##rm: + +#define CASE_ALL_MOV_RM() \ + MOV_CASE(, ) /* SSE */ \ + MOV_CASE(V, ) /* AVX-128 */ \ + MOV_CASE(V, Y) /* AVX-256 */ \ + MOV_AVX512_CASE(Z) \ + MOV_AVX512_CASE(Z256) \ + MOV_AVX512_CASE(Z128) + + // For loads from a constant pool to a vector register, print the constant + // loaded. 
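+ // Illustrative example: such a load may be annotated with the decoded
+ // constant, e.g.
+ //   xmm0 = [1,2,3,4]   (ConstantDataSequential)
+ //   ymm1 = <0,u,42>    (ConstantVector, "u" for undef elements)
+ // matching the formatting code below.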
+ CASE_ALL_MOV_RM() + if (!OutStreamer->isVerboseAsm()) + break; + if (MI->getNumOperands() > 4) + if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { + CS << "["; + for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) { + if (i != 0) + CS << ","; + if (CDS->getElementType()->isIntegerTy()) + CS << CDS->getElementAsInteger(i); + else if (CDS->getElementType()->isFloatTy()) + CS << CDS->getElementAsFloat(i); + else if (CDS->getElementType()->isDoubleTy()) + CS << CDS->getElementAsDouble(i); + else + CS << "?"; + } + CS << "]"; + OutStreamer->AddComment(CS.str()); + } else if (auto *CV = dyn_cast<ConstantVector>(C)) { + CS << "<"; + for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { + if (i != 0) + CS << ","; + Constant *COp = CV->getOperand(i); + if (isa<UndefValue>(COp)) { + CS << "u"; + } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + auto Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } + } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { + SmallString<32> Str; + CF->getValueAPF().toString(Str); + CS << Str; + } else { + CS << "?"; + } + } + CS << ">"; + OutStreamer->AddComment(CS.str()); + } + } + break; + } + + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + + // Stackmap shadows cannot include branch targets, so we can count the bytes + // in a call towards the shadow, but must ensure that the no thread returns + // in to the stackmap shadow. The only way to achieve this is if the call + // is at the end of the shadow. + if (MI->isCall()) { + // Count then size of the call towards the shadow + SMShadowTracker.count(TmpInst, getSubtargetInfo()); + // Then flush the shadow so that we fill with nops before the call, not + // after it. + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + // Then emit the call + OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo()); + return; + } + + EmitAndCountInstruction(TmpInst); +} diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp new file mode 100644 index 0000000..c9e636f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -0,0 +1,33 @@ +//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
+
+void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
+ if (!RestoreBasePointerOffset) {
+ const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+ unsigned SlotSize = RegInfo->getSlotSize();
+ for (const MCPhysReg *CSR =
+ RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF);
+ unsigned Reg = *CSR;
+ ++CSR)
+ {
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ RestoreBasePointerOffset -= SlotSize;
+ }
+ }
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..3a7a98d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,167 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include <vector>
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+ /// ForceFramePointer - True if the function is required to use a frame
+ /// pointer for reasons other than that it contains dynamic allocations or
+ /// that FP elimination is turned off. For example, the Cygwin main function
+ /// contains stack pointer re-alignment code which requires FP.
+ bool ForceFramePointer = false;
+
+ /// RestoreBasePointerOffset - Non-zero if the function has a base pointer
+ /// and makes a call to llvm.eh.sjlj.setjmp. When non-zero, the value is a
+ /// displacement from the frame pointer to a slot where the base pointer
+ /// is stashed.
+ signed char RestoreBasePointerOffset = 0;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize = 0;
+
+ /// BytesToPopOnReturn - Number of bytes the function pops on return (in
+ /// addition to the space used by the return address).
+ /// Used on Windows platforms for stdcall & fastcall name decoration.
+ unsigned BytesToPopOnReturn = 0;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex = 0;
+
+ /// \brief FrameIndex for the frame address slot.
+ int FrameAddrIndex = 0;
+
+ /// TailCallReturnAddrDelta - The number of bytes by which the return address
+ /// stack slot is moved as the result of tail call optimization.
+ int TailCallReturnAddrDelta = 0;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register.
This field + /// holds the virtual register into which the sret argument is passed. + unsigned SRetReturnReg = 0; + + /// GlobalBaseReg - keeps track of the virtual register initialized for + /// use as the global base register. This is used for PIC in some PIC + /// relocation models. + unsigned GlobalBaseReg = 0; + + /// VarArgsFrameIndex - FrameIndex for start of varargs area. + int VarArgsFrameIndex = 0; + /// RegSaveFrameIndex - X86-64 vararg func register save area. + int RegSaveFrameIndex = 0; + /// VarArgsGPOffset - X86-64 vararg func int reg offset. + unsigned VarArgsGPOffset = 0; + /// VarArgsFPOffset - X86-64 vararg func fp reg offset. + unsigned VarArgsFPOffset = 0; + /// ArgumentStackSize - The number of bytes on stack consumed by the arguments + /// being passed on the stack. + unsigned ArgumentStackSize = 0; + /// NumLocalDynamics - Number of local-dynamic TLS accesses. + unsigned NumLocalDynamics = 0; + /// HasPushSequences - Keeps track of whether this function uses sequences + /// of pushes to pass function parameters. + bool HasPushSequences = false; + + /// True if the function recovers from an SEH exception, and therefore needs + /// to spill and restore the frame pointer. + bool HasSEHFramePtrSave = false; + + /// The frame index of a stack object containing the original frame pointer + /// used to address arguments in a function using a base pointer. + int SEHFramePtrSaveIndex = 0; + +private: + /// ForwardedMustTailRegParms - A list of virtual and physical registers + /// that must be forwarded to every musttail call. + SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms; + +public: + X86MachineFunctionInfo() = default; + + explicit X86MachineFunctionInfo(MachineFunction &MF) {} + + bool getForceFramePointer() const { return ForceFramePointer;} + void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } + + bool getHasPushSequences() const { return HasPushSequences; } + void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; } + + bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; } + void setRestoreBasePointer(const MachineFunction *MF); + int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } + + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } + void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } + + unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; } + void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;} + + int getRAIndex() const { return ReturnAddrIndex; } + void setRAIndex(int Index) { ReturnAddrIndex = Index; } + + int getFAIndex() const { return FrameAddrIndex; } + void setFAIndex(int Index) { FrameAddrIndex = Index; } + + int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; } + void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;} + + unsigned getSRetReturnReg() const { return SRetReturnReg; } + void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } + + unsigned getGlobalBaseReg() const { return GlobalBaseReg; } + void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; } + + int getRegSaveFrameIndex() const { return RegSaveFrameIndex; } + void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; } + + unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; } + void setVarArgsGPOffset(unsigned Offset) { 
VarArgsGPOffset = Offset; } + + unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; } + void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; } + + unsigned getArgumentStackSize() const { return ArgumentStackSize; } + void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } + + unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + + bool getHasSEHFramePtrSave() const { return HasSEHFramePtrSave; } + void setHasSEHFramePtrSave(bool V) { HasSEHFramePtrSave = V; } + + int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; } + void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; } + + SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { + return ForwardedMustTailRegParms; + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp new file mode 100644 index 0000000..58020d9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -0,0 +1,326 @@ +//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that performs some optimizations with LEA +// instructions in order to improve code size. +// Currently, it does one thing: +// 1) Address calculations in load and store instructions are replaced by +// existing LEA def registers where possible. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-optimize-LEAs" + +static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden, + cl::desc("X86: Enable LEA optimizations."), + cl::init(false)); + +STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); + +namespace { +class OptimizeLEAPass : public MachineFunctionPass { +public: + OptimizeLEAPass() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { return "X86 LEA Optimize"; } + + /// \brief Loop over all of the basic blocks, replacing address + /// calculations in load and store instructions, if it's already + /// been calculated by LEA. Also, remove redundant LEAs. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + /// \brief Returns a distance between two instructions inside one basic block. + /// Negative result means, that instructions occur in reverse order. + int calcInstrDist(const MachineInstr &First, const MachineInstr &Last); + + /// \brief Choose the best \p LEA instruction from the \p List to replace + /// address calculation in \p MI instruction. Return the address displacement + /// and the distance between \p MI and the choosen \p LEA in \p AddrDispShift + /// and \p Dist. 
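+ /// For illustration (virtual register names are placeholders): if \p MI is
+ /// "%dst = MOV32rm %base, 1, %noreg, 16, %noreg" and the list holds
+ /// "%lea = LEA32r %base, 1, %noreg, 12, %noreg", the memory operands differ
+ /// only in displacement, so that LEA is a candidate with AddrDispShift = 4.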
+ bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist); + + /// \brief Returns true if two machine operand are identical and they are not + /// physical registers. + bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2); + + /// \brief Returns true if the instruction is LEA. + bool isLEA(const MachineInstr &MI); + + /// \brief Returns true if two instructions have memory operands that only + /// differ by displacement. The numbers of the first memory operands for both + /// instructions are specified through \p N1 and \p N2. The address + /// displacement is returned through AddrDispShift. + bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift); + + /// \brief Find all LEA instructions in the basic block. + void findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List); + + /// \brief Removes redundant address calculations. + bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List); + + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + + static char ID; +}; +char OptimizeLEAPass::ID = 0; +} + +FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } + +int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { + const MachineBasicBlock *MBB = First.getParent(); + + // Both instructions must be in the same basic block. + assert(Last.getParent() == MBB && + "Instructions are in different basic blocks"); + + return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) - + std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First)); +} + +// Find the best LEA instruction in the List to replace address recalculation in +// MI. Such LEA must meet these requirements: +// 1) The address calculated by the LEA differs only by the displacement from +// the address used in MI. +// 2) The register class of the definition of the LEA is compatible with the +// register class of the address base register of MI. +// 3) Displacement of the new memory operand should fit in 1 byte if possible. +// 4) The LEA should be as close to MI as possible, and prior to it if +// possible. +bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist) { + const MachineFunction *MF = MI.getParent()->getParent(); + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + LEA = nullptr; + + // Loop over all LEA instructions. + for (auto DefMI : List) { + int64_t AddrDispShiftTemp = 0; + + // Compare instructions memory operands. + if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp)) + continue; + + // Make sure address displacement fits 4 bytes. + if (!isInt<32>(AddrDispShiftTemp)) + continue; + + // Check that LEA def register can be used as MI address base. Some + // instructions can use a limited set of registers as address base, for + // example MOV8mr_NOREX. We could constrain the register class of the LEA + // def to suit MI, however since this case is very rare and hard to + // reproduce in a test it's just more reliable to skip the LEA. 
+ if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) != + MRI->getRegClass(DefMI->getOperand(0).getReg())) + continue; + + // Choose the closest LEA instruction from the list, prior to MI if + // possible. Note that we took into account resulting address displacement + // as well. Also note that the list is sorted by the order in which the LEAs + // occur, so the break condition is pretty simple. + int DistTemp = calcInstrDist(*DefMI, MI); + assert(DistTemp != 0 && + "The distance between two different instructions cannot be zero"); + if (DistTemp > 0 || LEA == nullptr) { + // Do not update return LEA, if the current one provides a displacement + // which fits in 1 byte, while the new candidate does not. + if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) && + isInt<8>(AddrDispShift)) + continue; + + LEA = DefMI; + AddrDispShift = AddrDispShiftTemp; + Dist = DistTemp; + } + + // FIXME: Maybe we should not always stop at the first LEA after MI. + if (DistTemp < 0) + break; + } + + return LEA != nullptr; +} + +bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1, + const MachineOperand &MO2) { + return MO1.isIdenticalTo(MO2) && + (!MO1.isReg() || + !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); +} + +bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +// Check if MI1 and MI2 have memory operands which represent addresses that +// differ only by displacement. +bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift) { + // Address base, scale, index and segment operands must be identical. + static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt, + X86::AddrIndexReg, X86::AddrSegmentReg}; + for (auto &N : IdenticalOpNums) + if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N))) + return false; + + // Address displacement operands may differ by a constant. + const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp); + const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp); + if (!isIdenticalOp(*Op1, *Op2)) { + if (Op1->isImm() && Op2->isImm()) + AddrDispShift = Op1->getImm() - Op2->getImm(); + else if (Op1->isGlobal() && Op2->isGlobal() && + Op1->getGlobal() == Op2->getGlobal()) + AddrDispShift = Op1->getOffset() - Op2->getOffset(); + else + return false; + } + + return true; +} + +void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List) { + for (auto &MI : MBB) { + if (isLEA(MI)) + List.push_back(const_cast<MachineInstr *>(&MI)); + } +} + +// Try to find load and store instructions which recalculate addresses already +// calculated by some LEA and replace their memory operands with its def +// register. +bool OptimizeLEAPass::removeRedundantAddrCalc( + const SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + assert(List.size() > 0); + MachineBasicBlock *MBB = List[0]->getParent(); + + // Process all instructions in basic block. + for (auto I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr &MI = *I++; + unsigned Opcode = MI.getOpcode(); + + // Instruction must be load or store. + if (!MI.mayLoadOrStore()) + continue; + + // Get the number of the first memory operand. 
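+ // (An X86 memory reference spans five consecutive operands: base register,
+ // scale, index register, displacement and segment register; these are the
+ // X86::AddrBaseReg ... X86::AddrSegmentReg offsets used below.)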
+ const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode); + + // If instruction has no memory operand - skip it. + if (MemOpNo < 0) + continue; + + MemOpNo += X86II::getOperandBias(Desc); + + // Get the best LEA instruction to replace address calculation. + MachineInstr *DefMI; + int64_t AddrDispShift; + int Dist; + if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist)) + continue; + + // If LEA occurs before current instruction, we can freely replace + // the instruction. If LEA occurs after, we can lift LEA above the + // instruction and this way to be able to replace it. Since LEA and the + // instruction have similar memory operands (thus, the same def + // instructions for these operands), we can always do that, without + // worries of using registers before their defs. + if (Dist < 0) { + DefMI->removeFromParent(); + MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + } + + // Since we can possibly extend register lifetime, clear kill flags. + MRI->clearKillFlags(DefMI->getOperand(0).getReg()); + + ++NumSubstLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump();); + + // Change instruction operands. + MI.getOperand(MemOpNo + X86::AddrBaseReg) + .ChangeToRegister(DefMI->getOperand(0).getReg(), false); + MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1); + MI.getOperand(MemOpNo + X86::AddrIndexReg) + .ChangeToRegister(X86::NoRegister, false); + MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift); + MI.getOperand(MemOpNo + X86::AddrSegmentReg) + .ChangeToRegister(X86::NoRegister, false); + + DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump();); + + Changed = true; + } + + return Changed; +} + +bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + // Perform this optimization only if we care about code size. + if (!EnableX86LEAOpt || !MF.getFunction()->optForSize()) + return false; + + MRI = &MF.getRegInfo(); + TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + + // Process all basic blocks. + for (auto &MBB : MF) { + SmallVector<MachineInstr *, 16> LEAs; + + // Find all LEA instructions in basic block. + findLEAs(MBB, LEAs); + + // If current basic block has no LEAs, move on to the next one. + if (LEAs.empty()) + continue; + + // Remove redundant address calculations. + Changed |= removeRedundantAddrCalc(LEAs); + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp new file mode 100644 index 0000000..0f425e2 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -0,0 +1,213 @@ +//===-------- X86PadShortFunction.cpp - pad short functions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which will pad short functions to prevent +// a stall if a function returns before the return address is ready. This +// is needed for some Intel Atom processors. 
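+// For example, with the Threshold of 4 cycles used below, a block that
+// reaches its RET in 2 cycles receives Threshold - 2 = 2 rounds of NOOP
+// padding immediately before the RET (see addPadding).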
+// +//===----------------------------------------------------------------------===// + +#include <algorithm> + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-pad-short-functions" + +STATISTIC(NumBBsPadded, "Number of basic blocks padded"); + +namespace { + struct VisitedBBInfo { + // HasReturn - Whether the BB contains a return instruction + bool HasReturn; + + // Cycles - Number of cycles until return if HasReturn is true, otherwise + // number of cycles until end of the BB + unsigned int Cycles; + + VisitedBBInfo() : HasReturn(false), Cycles(0) {} + VisitedBBInfo(bool HasReturn, unsigned int Cycles) + : HasReturn(HasReturn), Cycles(Cycles) {} + }; + + struct PadShortFunc : public MachineFunctionPass { + static char ID; + PadShortFunc() : MachineFunctionPass(ID) + , Threshold(4), STI(nullptr), TII(nullptr) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "X86 Atom pad short functions"; + } + + private: + void findReturns(MachineBasicBlock *MBB, + unsigned int Cycles = 0); + + bool cyclesUntilReturn(MachineBasicBlock *MBB, + unsigned int &Cycles); + + void addPadding(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &MBBI, + unsigned int NOOPsToAdd); + + const unsigned int Threshold; + + // ReturnBBs - Maps basic blocks that return to the minimum number of + // cycles until the return, starting from the entry block. + DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs; + + // VisitedBBs - Cache of previously visited BBs. + DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs; + + const X86Subtarget *STI; + const TargetInstrInfo *TII; + }; + + char PadShortFunc::ID = 0; +} + +FunctionPass *llvm::createX86PadShortFunctions() { + return new PadShortFunc(); +} + +/// runOnMachineFunction - Loop over all of the basic blocks, inserting +/// NOOP instructions before early exits. +bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { + if (MF.getFunction()->optForSize()) { + return false; + } + + STI = &MF.getSubtarget<X86Subtarget>(); + if (!STI->padShortFunctions()) + return false; + + TII = STI->getInstrInfo(); + + // Search through basic blocks and mark the ones that have early returns + ReturnBBs.clear(); + VisitedBBs.clear(); + findReturns(&MF.front()); + + bool MadeChange = false; + + MachineBasicBlock *MBB; + unsigned int Cycles = 0; + + // Pad the identified basic blocks with NOOPs + for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin(); + I != ReturnBBs.end(); ++I) { + MBB = I->first; + Cycles = I->second; + + if (Cycles < Threshold) { + // BB ends in a return. Skip over any DBG_VALUE instructions + // trailing the terminator. 
+ assert(MBB->size() > 0 && + "Basic block should contain at least a RET but is empty"); + MachineBasicBlock::iterator ReturnLoc = --MBB->end(); + + while (ReturnLoc->isDebugValue()) + --ReturnLoc; + assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() && + "Basic block does not end with RET"); + + addPadding(MBB, ReturnLoc, Threshold - Cycles); + NumBBsPadded++; + MadeChange = true; + } + } + + return MadeChange; +} + +/// findReturn - Starting at MBB, follow control flow and add all +/// basic blocks that contain a return to ReturnBBs. +void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) { + // If this BB has a return, note how many cycles it takes to get there. + bool hasReturn = cyclesUntilReturn(MBB, Cycles); + if (Cycles >= Threshold) + return; + + if (hasReturn) { + ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles); + return; + } + + // Follow branches in BB and look for returns + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(); + I != MBB->succ_end(); ++I) { + if (*I == MBB) + continue; + findReturns(*I, Cycles); + } +} + +/// cyclesUntilReturn - return true if the MBB has a return instruction, +/// and return false otherwise. +/// Cycles will be incremented by the number of cycles taken to reach the +/// return or the end of the BB, whichever occurs first. +bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, + unsigned int &Cycles) { + // Return cached result if BB was previously visited + DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it + = VisitedBBs.find(MBB); + if (it != VisitedBBs.end()) { + VisitedBBInfo BBInfo = it->second; + Cycles += BBInfo.Cycles; + return BBInfo.HasReturn; + } + + unsigned int CyclesToEnd = 0; + + for (MachineBasicBlock::iterator MBBI = MBB->begin(); + MBBI != MBB->end(); ++MBBI) { + MachineInstr *MI = MBBI; + // Mark basic blocks with a return instruction. Calls to other + // functions do not count because the called function will be padded, + // if necessary. + if (MI->isReturn() && !MI->isCall()) { + VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd); + Cycles += CyclesToEnd; + return true; + } + + CyclesToEnd += TII->getInstrLatency(STI->getInstrItineraryData(), MI); + } + + VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd); + Cycles += CyclesToEnd; + return false; +} + +/// addPadding - Add the given number of NOOP instructions to the function +/// just prior to the return at MBBI +void PadShortFunc::addPadding(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &MBBI, + unsigned int NOOPsToAdd) { + DebugLoc DL = MBBI->getDebugLoc(); + + while (NOOPsToAdd-- > 0) { + BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP)); + BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP)); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp new file mode 100644 index 0000000..5840443 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -0,0 +1,639 @@ +//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetRegisterInfo class. +// This file is responsible for the frame pointer elimination optimization +// on X86. 
+// +//===----------------------------------------------------------------------===// + +#include "X86RegisterInfo.h" +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +#define GET_REGINFO_TARGET_DESC +#include "X86GenRegisterInfo.inc" + +static cl::opt<bool> +EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), + cl::desc("Enable use of a base pointer for complex stack frames")); + +X86RegisterInfo::X86RegisterInfo(const Triple &TT) + : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP), + X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true), + (TT.isArch64Bit() ? X86::RIP : X86::EIP)) { + X86_MC::InitLLVM2SEHRegisterMapping(this); + + // Cache some information. + Is64Bit = TT.isArch64Bit(); + IsWin64 = Is64Bit && TT.isOSWindows(); + + // Use a callee-saved register as the base pointer. These registers must + // not conflict with any ABI requirements. For example, in 32-bit mode PIC + // requires GOT in the EBX register before function calls via PLT GOT pointer. + if (Is64Bit) { + SlotSize = 8; + // This matches the simplified 32-bit pointer code in the data layout + // computation. + // FIXME: Should use the data layout? + bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32; + StackPtr = Use64BitReg ? X86::RSP : X86::ESP; + FramePtr = Use64BitReg ? X86::RBP : X86::EBP; + BasePtr = Use64BitReg ? X86::RBX : X86::EBX; + } else { + SlotSize = 4; + StackPtr = X86::ESP; + FramePtr = X86::EBP; + BasePtr = X86::ESI; + } +} + +bool +X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + // ExeDepsFixer and PostRAScheduler require liveness. + return true; +} + +int +X86RegisterInfo::getSEHRegNum(unsigned i) const { + return getEncodingValue(i); +} + +const TargetRegisterClass * +X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC, + unsigned Idx) const { + // The sub_8bit sub-register index is more constrained in 32-bit mode. + // It behaves just like the sub_8bit_hi index. + if (!Is64Bit && Idx == X86::sub_8bit) + Idx = X86::sub_8bit_hi; + + // Forward to TableGen's default version. + return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx); +} + +const TargetRegisterClass * +X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned SubIdx) const { + // The sub_8bit sub-register index is more constrained in 32-bit mode. 
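+ // (Without a REX prefix, which exists only in 64-bit mode, just AL/BL/CL/DL
+ // are addressable as low 8-bit sub-registers, so the usable register set
+ // matches the one for sub_8bit_hi.)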
+ if (!Is64Bit && SubIdx == X86::sub_8bit) { + A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi); + if (!A) + return nullptr; + } + return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); +} + +const TargetRegisterClass * +X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { + // Don't allow super-classes of GR8_NOREX. This class is only used after + // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied + // to the full GR8 register class in 64-bit mode, so we cannot allow the + // reigster class inflation. + // + // The GR8_NOREX class is always used in a way that won't be constrained to a + // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the + // full GR8 class. + if (RC == &X86::GR8_NOREXRegClass) + return RC; + + const TargetRegisterClass *Super = RC; + TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); + do { + switch (Super->getID()) { + case X86::GR8RegClassID: + case X86::GR16RegClassID: + case X86::GR32RegClassID: + case X86::GR64RegClassID: + case X86::FR32RegClassID: + case X86::FR64RegClassID: + case X86::RFP32RegClassID: + case X86::RFP64RegClassID: + case X86::RFP80RegClassID: + case X86::VR128RegClassID: + case X86::VR256RegClassID: + // Don't return a super-class that would shrink the spill size. + // That can happen with the vector and float classes. + if (Super->getSize() == RC->getSize()) + return Super; + } + Super = *I++; + } while (Super); + return RC; +} + +const TargetRegisterClass * +X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + switch (Kind) { + default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); + case 0: // Normal GPRs. + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64RegClass; + return &X86::GR32RegClass; + case 1: // Normal GPRs except the stack pointer (for encoding reasons). + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOSPRegClass; + return &X86::GR32_NOSPRegClass; + case 2: // NOREX GPRs. + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREXRegClass; + return &X86::GR32_NOREXRegClass; + case 3: // NOREX GPRs except the stack pointer (for encoding reasons). + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREX_NOSPRegClass; + return &X86::GR32_NOREX_NOSPRegClass; + case 4: // Available for tailcall (not callee-saved GPRs). + return getGPRsForTailCall(MF); + } +} + +const TargetRegisterClass * +X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { + const Function *F = MF.getFunction(); + if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + return &X86::GR64_TCW64RegClass; + else if (Is64Bit) + return &X86::GR64_TCRegClass; + + bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false); + if (hasHipeCC) + return &X86::GR32RegClass; + return &X86::GR32_TCRegClass; +} + +const TargetRegisterClass * +X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &X86::CCRRegClass) { + if (Is64Bit) + return &X86::GR64RegClass; + else + return &X86::GR32RegClass; + } + return RC; +} + +unsigned +X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const X86FrameLowering *TFI = getFrameLowering(MF); + + unsigned FPDiff = TFI->hasFP(MF) ? 
1 : 0; + switch (RC->getID()) { + default: + return 0; + case X86::GR32RegClassID: + return 4 - FPDiff; + case X86::GR64RegClassID: + return 12 - FPDiff; + case X86::VR128RegClassID: + return Is64Bit ? 10 : 4; + case X86::VR64RegClassID: + return 4; + } +} + +const MCPhysReg * +X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + bool CallsEHReturn = MF->getMMI().callsEHReturn(); + + assert(MF && "MachineFunction required"); + switch (MF->getFunction()->getCallingConv()) { + case CallingConv::GHC: + case CallingConv::HiPE: + return CSR_NoRegs_SaveList; + case CallingConv::AnyReg: + if (HasAVX) + return CSR_64_AllRegs_AVX_SaveList; + return CSR_64_AllRegs_SaveList; + case CallingConv::PreserveMost: + return CSR_64_RT_MostRegs_SaveList; + case CallingConv::PreserveAll: + if (HasAVX) + return CSR_64_RT_AllRegs_AVX_SaveList; + return CSR_64_RT_AllRegs_SaveList; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_SaveList; + break; + case CallingConv::Intel_OCL_BI: { + if (HasAVX512 && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; + if (HasAVX512 && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX512_SaveList; + if (HasAVX && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX_SaveList; + if (HasAVX && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX_SaveList; + if (!HasAVX && !IsWin64 && Is64Bit) + return CSR_64_Intel_OCL_BI_SaveList; + break; + } + case CallingConv::HHVM: + return CSR_64_HHVM_SaveList; + case CallingConv::Cold: + if (Is64Bit) + return CSR_64_MostRegs_SaveList; + break; + case CallingConv::X86_64_Win64: + return CSR_Win64_SaveList; + case CallingConv::X86_64_SysV: + if (CallsEHReturn) + return CSR_64EHRet_SaveList; + return CSR_64_SaveList; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_SaveList; + else + return CSR_64_AllRegs_SaveList; + } else { + if (HasSSE) + return CSR_32_AllRegs_SSE_SaveList; + else + return CSR_32_AllRegs_SaveList; + } + default: + break; + } + + if (Is64Bit) { + if (IsWin64) + return CSR_Win64_SaveList; + if (CallsEHReturn) + return CSR_64EHRet_SaveList; + return CSR_64_SaveList; + } + if (CallsEHReturn) + return CSR_32EHRet_SaveList; + return CSR_32_SaveList; +} + +const uint32_t * +X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + + switch (CC) { + case CallingConv::GHC: + case CallingConv::HiPE: + return CSR_NoRegs_RegMask; + case CallingConv::AnyReg: + if (HasAVX) + return CSR_64_AllRegs_AVX_RegMask; + return CSR_64_AllRegs_RegMask; + case CallingConv::PreserveMost: + return CSR_64_RT_MostRegs_RegMask; + case CallingConv::PreserveAll: + if (HasAVX) + return CSR_64_RT_AllRegs_AVX_RegMask; + return CSR_64_RT_AllRegs_RegMask; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_RegMask; + break; + case CallingConv::Intel_OCL_BI: { + if (HasAVX512 && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; + if (HasAVX512 && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX512_RegMask; + if (HasAVX && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX_RegMask; + if (HasAVX && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX_RegMask; + if (!HasAVX && !IsWin64 && Is64Bit) 
+ return CSR_64_Intel_OCL_BI_RegMask;
+ break;
+ }
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_RegMask;
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_64_MostRegs_RegMask;
+ break;
+ case CallingConv::X86_64_Win64:
+ return CSR_Win64_RegMask;
+ case CallingConv::X86_64_SysV:
+ return CSR_64_RegMask;
+ case CallingConv::X86_INTR:
+ if (Is64Bit) {
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ else
+ return CSR_64_AllRegs_RegMask;
+ } else {
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_RegMask;
+ else
+ return CSR_32_AllRegs_RegMask;
+ }
+ default:
+ break;
+ }
+
+ // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
+ // callsEHReturn().
+ if (Is64Bit) {
+ if (IsWin64)
+ return CSR_Win64_RegMask;
+ return CSR_64_RegMask;
+ }
+ return CSR_32_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
+const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const {
+ return CSR_64_TLS_Darwin_RegMask;
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+
+ // Set the stack-pointer register and its aliases as reserved.
+ for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
+ Reserved.set(*I);
+
+ // Set the instruction pointer register and its aliases as reserved.
+ for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
+ Reserved.set(*I);
+
+ // Set the frame-pointer register and its aliases as reserved if needed.
+ if (TFI->hasFP(MF)) {
+ for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
+ Reserved.set(*I);
+ }
+
+ // Set the base-pointer register and its aliases as reserved if needed.
+ if (hasBasePointer(MF)) {
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
+ const uint32_t *RegMask = getCallPreservedMask(MF, CC);
+ if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+ report_fatal_error(
+ "Stack realignment in presence of dynamic allocas is not supported with "
+ "this calling convention.");
+
+ unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
+ for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
+ I.isValid(); ++I)
+ Reserved.set(*I);
+ }
+
+ // Mark the segment registers as reserved.
+ Reserved.set(X86::CS);
+ Reserved.set(X86::SS);
+ Reserved.set(X86::DS);
+ Reserved.set(X86::ES);
+ Reserved.set(X86::FS);
+ Reserved.set(X86::GS);
+
+ // Mark the floating point stack registers as reserved.
+ for (unsigned n = 0; n != 8; ++n)
+ Reserved.set(X86::ST0 + n);
+
+ // Reserve the registers that only exist in 64-bit mode.
+ if (!Is64Bit) {
+ // These 8-bit registers are part of the x86-64 extension even though their
+ // super-registers are the old 32-bit registers.
+ Reserved.set(X86::SIL);
+ Reserved.set(X86::DIL);
+ Reserved.set(X86::BPL);
+ Reserved.set(X86::SPL);
+
+ for (unsigned n = 0; n != 8; ++n) {
+ // R8, R9, ...
+ for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+
+ // XMM8, XMM9, ...
+ for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); + } + } + if (!Is64Bit || !MF.getSubtarget<X86Subtarget>().hasAVX512()) { + for (unsigned n = 16; n != 32; ++n) { + for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); + } + } + + return Reserved; +} + +void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { + // Check if the EFLAGS register is marked as live-out. This shouldn't happen, + // because the calling convention defines the EFLAGS register as NOT + // preserved. + // + // Unfortunatelly the EFLAGS show up as live-out after branch folding. Adding + // an assert to track this and clear the register afterwards to avoid + // unnecessary crashes during release builds. + assert(!(Mask[X86::EFLAGS / 32] & (1U << (X86::EFLAGS % 32))) && + "EFLAGS are not live-out from a patchpoint."); + + // Also clean other registers that don't need preserving (IP). + for (auto Reg : {X86::EFLAGS, X86::RIP, X86::EIP, X86::IP}) + Mask[Reg / 32] &= ~(1U << (Reg % 32)); +} + +//===----------------------------------------------------------------------===// +// Stack Frame Processing methods +//===----------------------------------------------------------------------===// + +static bool CantUseSP(const MachineFrameInfo *MFI) { + return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); +} + +bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!EnableBasePointer) + return false; + + // When we need stack realignment, we can't address the stack from the frame + // pointer. When we have dynamic allocas or stack-adjusting inline asm, we + // can't address variables from the stack pointer. MS inline asm can + // reference locals while also adjusting the stack pointer. When we can't + // use both the SP and the FP, we need a separate base pointer register. + bool CantUseFP = needsStackRealignment(MF); + return CantUseFP && CantUseSP(MFI); +} + +bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + + // Stack realignment requires a frame pointer. If we already started + // register allocation with frame pointer elimination, it is too late now. + if (!MRI->canReserveReg(FramePtr)) + return false; + + // If a base pointer is necessary. Check that it isn't too late to reserve + // it. + if (CantUseSP(MFI)) + return MRI->canReserveReg(BasePtr); + return true; +} + +bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { + // Since X86 defines assignCalleeSavedSpillSlots which always return true + // this function neither used nor tested. + llvm_unreachable("Unused function on X86. Otherwise need a test case."); +} + +void +X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + const X86FrameLowering *TFI = getFrameLowering(MF); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + unsigned BasePtr; + + unsigned Opc = MI.getOpcode(); + bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || + Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; + + if (hasBasePointer(MF)) + BasePtr = (FrameIndex < 0 ? 
FramePtr : getBaseRegister()); + else if (needsStackRealignment(MF)) + BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); + else if (AfterFPPop) + BasePtr = StackPtr; + else + BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + + // LOCAL_ESCAPE uses a single offset, with no register. It only works in the + // simple FP case, and doesn't work with stack realignment. On 32-bit, the + // offset is from the traditional base pointer location. On 64-bit, the + // offset is from the SP at the end of the prologue, not the FP location. This + // matches the behavior of llvm.frameaddress. + unsigned IgnoredFrameReg; + if (Opc == TargetOpcode::LOCAL_ESCAPE) { + MachineOperand &FI = MI.getOperand(FIOperandNum); + int Offset; + Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); + FI.ChangeToImmediate(Offset); + return; + } + + // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit + // register as source operand, semantic is the same and destination is + // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. + if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) + BasePtr = getX86SubSuperRegister(BasePtr, 64); + + // This must be part of a four operand memory reference. Replace the + // FrameIndex with base register with EBP. Add an offset to the offset. + MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); + + // Now add the frame object offset to the offset from EBP. + int FIOffset; + if (AfterFPPop) { + // Tail call jmp happens after FP is popped. + const MachineFrameInfo *MFI = MF.getFrameInfo(); + FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); + } else + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); + + if (BasePtr == StackPtr) + FIOffset += SPAdj; + + // The frame index format for stackmaps and patchpoints is different from the + // X86 format. It only has a FI and an offset. + if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { + assert(BasePtr == FramePtr && "Expected the FP as base register"); + int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset; + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + return; + } + + if (MI.getOperand(FIOperandNum+3).isImm()) { + // Offset is a 32-bit integer. + int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); + int Offset = FIOffset + Imm; + assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && + "Requesting 64-bit offset in 32-bit immediate!"); + MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); + } else { + // Offset is symbolic. This is extremely rare. + uint64_t Offset = FIOffset + + (uint64_t)MI.getOperand(FIOperandNum+3).getOffset(); + MI.getOperand(FIOperandNum + 3).setOffset(Offset); + } +} + +unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const X86FrameLowering *TFI = getFrameLowering(MF); + return TFI->hasFP(MF) ? 
FramePtr : StackPtr; +} + +unsigned +X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + unsigned FrameReg = getFrameRegister(MF); + if (Subtarget.isTarget64BitILP32()) + FrameReg = getX86SubSuperRegister(FrameReg, 32); + return FrameReg; +} + +unsigned llvm::get512BitSuperRegister(unsigned Reg) { + if (Reg >= X86::XMM0 && Reg <= X86::XMM31) + return X86::ZMM0 + (Reg - X86::XMM0); + if (Reg >= X86::YMM0 && Reg <= X86::YMM31) + return X86::ZMM0 + (Reg - X86::YMM0); + if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31) + return Reg; + llvm_unreachable("Unexpected SIMD register"); +} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h new file mode 100644 index 0000000..f014c8f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -0,0 +1,143 @@ +//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H +#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H + +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#include "X86GenRegisterInfo.inc" + +namespace llvm { + class Triple; + +class X86RegisterInfo final : public X86GenRegisterInfo { +private: + /// Is64Bit - Is the target 64-bits. + /// + bool Is64Bit; + + /// IsWin64 - Is the target on of win64 flavours + /// + bool IsWin64; + + /// SlotSize - Stack slot size in bytes. + /// + unsigned SlotSize; + + /// StackPtr - X86 physical register used as stack ptr. + /// + unsigned StackPtr; + + /// FramePtr - X86 physical register used as frame ptr. + /// + unsigned FramePtr; + + /// BasePtr - X86 physical register used as a base ptr in complex stack + /// frames. I.e., when we need a 3rd base, not just SP and FP, due to + /// variable size stack objects. + unsigned BasePtr; + +public: + X86RegisterInfo(const Triple &TT); + + // FIXME: This should be tablegen'd like getDwarfRegNum is + int getSEHRegNum(unsigned i) const; + + /// Code Generation virtual methods... + /// + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; + + /// getMatchingSuperRegClass - Return a subclass of the specified register + /// class A so that each register in it has a sub-register of the + /// specified sub-register index which is in the specified register class B. + const TargetRegisterClass * + getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned Idx) const override; + + const TargetRegisterClass * + getSubClassWithSubReg(const TargetRegisterClass *RC, + unsigned Idx) const override; + + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; + + /// getPointerRegClass - Returns a TargetRegisterClass used for pointer + /// values. + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; + + /// getCrossCopyRegClass - Returns a legal register class to copy a register + /// in the specified class to or from. 
Returns NULL if it is not possible to copy + /// between two registers of the specified class. + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + + /// getGPRsForTailCall - Returns a register class with registers that can be + /// used in forming tail calls. + const TargetRegisterClass * + getGPRsForTailCall(const MachineFunction &MF) const; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; + + /// getCalleeSavedRegs - Return a null-terminated list of all of the + /// callee-save registers on this target. + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction* MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + const uint32_t *getNoPreservedMask() const override; + + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. + const uint32_t *getDarwinTLSCallPreservedMask() const; + + /// getReservedRegs - Returns a bitset indexed by physical register number + /// indicating if a register is a special register that has particular uses and + /// should be considered unavailable at all times, e.g. SP, RA. This is used by + /// the register scavenger to determine what registers are free. + BitVector getReservedRegs(const MachineFunction &MF) const override; + + void adjustStackMapLiveOutMask(uint32_t *Mask) const override; + + bool hasBasePointer(const MachineFunction &MF) const; + + bool canRealignStack(const MachineFunction &MF) const override; + + bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, + int &FrameIdx) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const; + unsigned getStackRegister() const { return StackPtr; } + unsigned getBaseRegister() const { return BasePtr; } + // FIXME: Move to FrameInfo + unsigned getSlotSize() const { return SlotSize; } +}; + +// get512BitSuperRegister - X86 utility - returns 512-bit super register +unsigned get512BitSuperRegister(unsigned Reg); + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td new file mode 100644 index 0000000..56f0d93 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -0,0 +1,497 @@ +//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 Register file, defining the registers themselves, +// aliases between the registers, and the register classes built out of the +// registers. +// +//===----------------------------------------------------------------------===// + +class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> { + let Namespace = "X86"; + let HWEncoding = Enc; + let SubRegs = subregs; +} + +// Subregister indices.
+let Namespace = "X86" in { + def sub_8bit : SubRegIndex<8>; + def sub_8bit_hi : SubRegIndex<8, 8>; + def sub_16bit : SubRegIndex<16>; + def sub_32bit : SubRegIndex<32>; + def sub_xmm : SubRegIndex<128>; + def sub_ymm : SubRegIndex<256>; +} + +//===----------------------------------------------------------------------===// +// Register definitions... +// + +// In the register alias definitions below, we define which registers alias +// which others. We only specify which registers the small registers alias, +// because the register file generator is smart enough to figure out that +// AL aliases AX if we tell it that AX aliased AL (for example). + +// Dwarf numbering is different for 32-bit and 64-bit, and there are +// variations by target as well. Currently the first entry is for X86-64, +// second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux +// and debug information on X86-32/Darwin) + +// 8-bit registers +// Low registers +def AL : X86Reg<"al", 0>; +def DL : X86Reg<"dl", 2>; +def CL : X86Reg<"cl", 1>; +def BL : X86Reg<"bl", 3>; + +// High registers. On x86-64, these cannot be used in any instruction +// with a REX prefix. +def AH : X86Reg<"ah", 4>; +def DH : X86Reg<"dh", 6>; +def CH : X86Reg<"ch", 5>; +def BH : X86Reg<"bh", 7>; + +// X86-64 only, requires REX. +let CostPerUse = 1 in { +def SIL : X86Reg<"sil", 6>; +def DIL : X86Reg<"dil", 7>; +def BPL : X86Reg<"bpl", 5>; +def SPL : X86Reg<"spl", 4>; +def R8B : X86Reg<"r8b", 8>; +def R9B : X86Reg<"r9b", 9>; +def R10B : X86Reg<"r10b", 10>; +def R11B : X86Reg<"r11b", 11>; +def R12B : X86Reg<"r12b", 12>; +def R13B : X86Reg<"r13b", 13>; +def R14B : X86Reg<"r14b", 14>; +def R15B : X86Reg<"r15b", 15>; +} + +// 16-bit registers +let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in { +def AX : X86Reg<"ax", 0, [AL,AH]>; +def DX : X86Reg<"dx", 2, [DL,DH]>; +def CX : X86Reg<"cx", 1, [CL,CH]>; +def BX : X86Reg<"bx", 3, [BL,BH]>; +} +let SubRegIndices = [sub_8bit] in { +def SI : X86Reg<"si", 6, [SIL]>; +def DI : X86Reg<"di", 7, [DIL]>; +def BP : X86Reg<"bp", 5, [BPL]>; +def SP : X86Reg<"sp", 4, [SPL]>; +} +def IP : X86Reg<"ip", 0>; + +// X86-64 only, requires REX. 
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in { +def R8W : X86Reg<"r8w", 8, [R8B]>; +def R9W : X86Reg<"r9w", 9, [R9B]>; +def R10W : X86Reg<"r10w", 10, [R10B]>; +def R11W : X86Reg<"r11w", 11, [R11B]>; +def R12W : X86Reg<"r12w", 12, [R12B]>; +def R13W : X86Reg<"r13w", 13, [R13B]>; +def R14W : X86Reg<"r14w", 14, [R14B]>; +def R15W : X86Reg<"r15w", 15, [R15B]>; +} + +// 32-bit registers +let SubRegIndices = [sub_16bit] in { +def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>; +def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>; +def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>; +def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>; +def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>; +def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>; +def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>; +def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>; +def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>; + +// X86-64 only, requires REX +let CostPerUse = 1 in { +def R8D : X86Reg<"r8d", 8, [R8W]>; +def R9D : X86Reg<"r9d", 9, [R9W]>; +def R10D : X86Reg<"r10d", 10, [R10W]>; +def R11D : X86Reg<"r11d", 11, [R11W]>; +def R12D : X86Reg<"r12d", 12, [R12W]>; +def R13D : X86Reg<"r13d", 13, [R13W]>; +def R14D : X86Reg<"r14d", 14, [R14W]>; +def R15D : X86Reg<"r15d", 15, [R15W]>; +}} + +// 64-bit registers, X86-64 only +let SubRegIndices = [sub_32bit] in { +def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>; +def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>; +def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>; +def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>; +def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>; +def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>; +def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>; +def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>; + +// These also require REX. +let CostPerUse = 1 in { +def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>; +def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>; +def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>; +def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>; +def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>; +def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>; +def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>; +def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>; +def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>; +}} + +// MMX Registers. These are actually aliased to ST0 .. ST7 +def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>; +def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>; +def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>; +def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>; +def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>; +def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>; +def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>; +def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>; + +// Pseudo Floating Point registers +def FP0 : X86Reg<"fp0", 0>; +def FP1 : X86Reg<"fp1", 0>; +def FP2 : X86Reg<"fp2", 0>; +def FP3 : X86Reg<"fp3", 0>; +def FP4 : X86Reg<"fp4", 0>; +def FP5 : X86Reg<"fp5", 0>; +def FP6 : X86Reg<"fp6", 0>; +def FP7 : X86Reg<"fp7", 0>; + +// XMM Registers, used by the various SSE instruction set extensions. 
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>; +def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>; +def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>; +def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>; +def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>; +def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>; +def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>; +def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>; + +// X86-64 only +let CostPerUse = 1 in { +def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>; +def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>; +def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>; +def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>; +def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>; +def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>; +def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>; +def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>; + +def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>; +def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>; +def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>; +def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>; +def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>; +def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>; +def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>; +def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>; +def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>; +def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>; +def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>; +def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>; +def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>; +def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>; +def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>; +def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>; + +} // CostPerUse + +// YMM0-15 registers, used by AVX instructions and +// YMM16-31 registers, used by AVX-512 instructions. +let SubRegIndices = [sub_xmm] in { + foreach Index = 0-31 in { + def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } +} + +// ZMM Registers, used by AVX-512 instructions. +let SubRegIndices = [sub_ymm] in { + foreach Index = 0-31 in { + def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } +} + +// Mask Registers, used by AVX-512 instructions. +def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; +def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; +def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; +def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; +def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; +def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; +def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; +def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; + +// Floating point stack registers. These don't map one-to-one to the FP +// pseudo registers, but we still mark them as aliasing FP registers. That +// way both kinds can be live without exceeding the stack depth. ST registers +// are only live around inline assembly. 
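The comment above and the vector definitions earlier in this file lean on the same idea: one physical register exposed through several views. ST(i) is marked as aliasing the FP pseudo registers, and each zmm register contains a ymm which in turn contains an xmm via the sub_ymm/sub_xmm indices. A minimal standalone C++ sketch of the vector case (names are illustrative and it is not part of the patch); the ST register definitions continue below:

#include <cassert>
#include <string>

// Vector register N is one physical register seen at three widths: xmmN is
// the low 128 bits of ymmN, which is the low 256 bits of zmmN. This mirrors
// what the sub_xmm/sub_ymm indices and get512BitSuperRegister express.
std::string vectorRegName(unsigned Index, unsigned Bits) {
  assert(Index < 32 && "AVX-512 exposes 32 vector registers");
  switch (Bits) {
  case 128: return "xmm" + std::to_string(Index);
  case 256: return "ymm" + std::to_string(Index);
  case 512: return "zmm" + std::to_string(Index);
  default:  assert(false && "vector views are 128, 256 or 512 bits wide");
  }
  return "";
}

int main() {
  // xmm7, ymm7 and zmm7 are all views of vector register number 7.
  return vectorRegName(7, 512) == "zmm7" ? 0 : 1;
}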
+def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>; +def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>; +def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>; +def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>; +def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>; +def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>; +def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>; +def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>; + +// Floating-point status word +def FPSW : X86Reg<"fpsw", 0>; + +// Status flags register +def EFLAGS : X86Reg<"flags", 0>; + +// Segment registers +def CS : X86Reg<"cs", 1>; +def DS : X86Reg<"ds", 3>; +def SS : X86Reg<"ss", 2>; +def ES : X86Reg<"es", 0>; +def FS : X86Reg<"fs", 4>; +def GS : X86Reg<"gs", 5>; + +// Debug registers +def DR0 : X86Reg<"dr0", 0>; +def DR1 : X86Reg<"dr1", 1>; +def DR2 : X86Reg<"dr2", 2>; +def DR3 : X86Reg<"dr3", 3>; +def DR4 : X86Reg<"dr4", 4>; +def DR5 : X86Reg<"dr5", 5>; +def DR6 : X86Reg<"dr6", 6>; +def DR7 : X86Reg<"dr7", 7>; +def DR8 : X86Reg<"dr8", 8>; +def DR9 : X86Reg<"dr9", 9>; +def DR10 : X86Reg<"dr10", 10>; +def DR11 : X86Reg<"dr11", 11>; +def DR12 : X86Reg<"dr12", 12>; +def DR13 : X86Reg<"dr13", 13>; +def DR14 : X86Reg<"dr14", 14>; +def DR15 : X86Reg<"dr15", 15>; + +// Control registers +def CR0 : X86Reg<"cr0", 0>; +def CR1 : X86Reg<"cr1", 1>; +def CR2 : X86Reg<"cr2", 2>; +def CR3 : X86Reg<"cr3", 3>; +def CR4 : X86Reg<"cr4", 4>; +def CR5 : X86Reg<"cr5", 5>; +def CR6 : X86Reg<"cr6", 6>; +def CR7 : X86Reg<"cr7", 7>; +def CR8 : X86Reg<"cr8", 8>; +def CR9 : X86Reg<"cr9", 9>; +def CR10 : X86Reg<"cr10", 10>; +def CR11 : X86Reg<"cr11", 11>; +def CR12 : X86Reg<"cr12", 12>; +def CR13 : X86Reg<"cr13", 13>; +def CR14 : X86Reg<"cr14", 14>; +def CR15 : X86Reg<"cr15", 15>; + +// Pseudo index registers +def EIZ : X86Reg<"eiz", 4>; +def RIZ : X86Reg<"riz", 4>; + +// Bound registers, used in MPX instructions +def BND0 : X86Reg<"bnd0", 0>; +def BND1 : X86Reg<"bnd1", 1>; +def BND2 : X86Reg<"bnd2", 2>; +def BND3 : X86Reg<"bnd3", 3>; + +//===----------------------------------------------------------------------===// +// Register Class Definitions... now that we have all of the pieces, define the +// top-level register classes. The order specified in the register list is +// implicitly defined to be the register allocation order. +// + +// List call-clobbered registers before callee-save registers. RBX, RBP, (and +// R12, R13, R14, and R15 for X86-64) are callee-save registers. +// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and +// R8B, ... R15B. +// Allocate R12 and R13 last, as these require an extra byte when +// encoded in x86_64 instructions. +// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in +// 64-bit mode. The main complication is that they cannot be encoded in an +// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc. +// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d" +// cannot be encoded. 
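The encoding restriction called out in the FIXME above amounts to a simple predicate: reaching SIL/DIL/BPL/SPL/R8B-R15B needs a REX prefix, while a REX prefix makes AH/BH/CH/DH unencodable, so the two groups can never meet in one instruction. A small standalone C++ sketch of that rule (illustrative only, not part of the patch); the GR8 class definition follows right after it:

#include <cassert>
#include <set>
#include <string>

// High-byte registers are only addressable *without* a REX prefix, while the
// byte registers introduced by x86-64 are only addressable *with* one.
static const std::set<std::string> HighByteRegs = {"ah", "bh", "ch", "dh"};
static const std::set<std::string> RexOnlyRegs = {
    "sil", "dil", "bpl", "spl", "r8b",  "r9b",  "r10b", "r11b",
    "r12b", "r13b", "r14b", "r15b"};

// True if the two 8-bit operands can appear together in one x86-64
// instruction; e.g. "addb %ah, %dil" mixes the groups and is rejected.
bool encodableTogether(const std::string &A, const std::string &B) {
  bool UsesHighByte = HighByteRegs.count(A) || HighByteRegs.count(B);
  bool NeedsRex = RexOnlyRegs.count(A) || RexOnlyRegs.count(B);
  return !(UsesHighByte && NeedsRex);
}

int main() {
  assert(encodableTogether("al", "dil"));   // fine: no high-byte register
  assert(!encodableTogether("ah", "dil"));  // the case from the comment above
  return 0;
}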
+def GR8 : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL, + R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> { + let AltOrders = [(sub GR8, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getSubtarget<X86Subtarget>().is64Bit(); + }]; +} + +def GR16 : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP, + R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>; + +def GR32 : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, + R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>; + +// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since +// RIP isn't really a register and it can't be used anywhere except in an +// address, but it doesn't cause trouble. +def GR64 : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; + +// Segment registers for use by MOV instructions (and others) that have a +// segment register as one operand. Always contain a 16-bit segment +// descriptor. +def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>; + +// Debug registers. +def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>; + +// Control registers. +def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; + +// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of +// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d" +// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers +// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD, +// and GR64_ABCD are classes for registers that support 8-bit h-register +// operations. +def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; +def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; +def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>; +def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>; +def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>; +def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; +def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, + R8, R9, R11, RIP)>; +def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, + R8, R9, R10, R11, RIP)>; + +// GR8_NOREX - GR8 registers which do not require a REX prefix. +def GR8_NOREX : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH)> { + let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getSubtarget<X86Subtarget>().is64Bit(); + }]; +} +// GR16_NOREX - GR16 registers which do not require a REX prefix. +def GR16_NOREX : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP)>; +// GR32_NOREX - GR32 registers which do not require a REX prefix. +def GR32_NOREX : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>; +// GR64_NOREX - GR64 registers which do not require a REX prefix. +def GR64_NOREX : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>; + +// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit +// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs +// to clear upper 32-bits of RAX so is not a NOP. +def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>; + +// GR32_NOSP - GR32 registers except ESP. 
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>; + +// GR64_NOSP - GR64 registers except RSP (and RIP). +def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>; + +// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except +// ESP. +def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, + (and GR32_NOREX, GR32_NOSP)>; + +// GR64_NOREX_NOSP - GR64_NOREX registers except RSP. +def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, + (and GR64_NOREX, GR64_NOSP)>; + +// A class to support the 'A' assembler constraint: EAX then EDX. +def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>; + +// Scalar SSE2 floating point registers. +def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; + +def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; + +def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>; + + +// FIXME: This sets up the floating point register files as though they are f64 +// values, though they really are f80 values. This will cause us to spill +// values as 64-bit quantities instead of 80-bit quantities, which is much much +// faster on common hardware. In reality, this should be controlled by a +// command line option or something. + +def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>; +def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>; +def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>; + +// Floating point stack registers (these are not allocatable by the +// register allocator - the floating point stackifier is responsible +// for transforming FPn allocations to STn registers) +def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { + let isAllocatable = 0; +} + +// Generic vector registers: VR64 and VR128. +// Ensure that float types are declared first - only float is legal on SSE1. +def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], + 128, (add FR32)>; +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 15)>; + +// Status flags registers. +def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} +def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} + +// AVX-512 vector/mask registers. +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], + 512, (sequence "ZMM%u", 0, 31)>; + +// Scalar AVX-512 floating point registers. 
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; + +def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; + +// Extended VR128 and VR256 for AVX-512 instructions +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 31)>; + +// Mask registers +def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;} +def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;} +def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;} +def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;} +def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} +def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} +def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} + +def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;} +def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;} +def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;} +def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;} +def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;} +def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} +def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} + +// Bound registers +def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td new file mode 100644 index 0000000..677e824 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td @@ -0,0 +1,2147 @@ +//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Haswell to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def HaswellModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and HW can decode 4 + // instructions per cycle. + let IssueWidth = 4; + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 4; + let MispredictPenalty = 16; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = HaswellModel in { + +// Haswell can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. 
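Before the port definitions below: the grouping described in that comment is essentially a per-micro-op bitmask, where a ProcResGroup such as HWPort06 means "either port 0 or port 6". A small standalone C++ sketch of that idea (illustrative only, not part of the patch):

#include <bitset>
#include <initializer_list>

// One bit per Haswell dispatch port; a port group is the OR of its members,
// so a micro-op tagged HWPort06 may issue on either port 0 or port 6.
using PortMask = std::bitset<8>;

PortMask makeGroup(std::initializer_list<unsigned> Ports) {
  PortMask M;
  for (unsigned P : Ports)
    M.set(P);
  return M;
}

// A micro-op can issue this cycle if at least one of its candidate ports is
// still free.
bool canIssue(PortMask Candidates, PortMask FreePorts) {
  return (Candidates & FreePorts).any();
}

int main() {
  PortMask HWPort06 = makeGroup({0, 6});   // e.g. simple shifts and jumps
  PortMask Free = makeGroup({1, 6, 7});    // ports left after earlier issues
  return canIssue(HWPort06, Free) ? 0 : 1; // issues on port 6
}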
+def HWPort0 : ProcResource<1>; +def HWPort1 : ProcResource<1>; +def HWPort2 : ProcResource<1>; +def HWPort3 : ProcResource<1>; +def HWPort4 : ProcResource<1>; +def HWPort5 : ProcResource<1>; +def HWPort6 : ProcResource<1>; +def HWPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def HWPort01 : ProcResGroup<[HWPort0, HWPort1]>; +def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>; +def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>; +def HWPort04 : ProcResGroup<[HWPort0, HWPort4]>; +def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>; +def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>; +def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>; +def HWPort16 : ProcResGroup<[HWPort1, HWPort6]>; +def HWPort56 : ProcResGroup<[HWPort5, HWPort6]>; +def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>; +def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>; +def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>; + +// 60 Entry Unified Scheduler +def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4, + HWPort5, HWPort6, HWPort7]> { + let BufferSize=60; +} + +// Integer division issued on port 0. +def HWDivider : ProcResource<1>; + +// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 4>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [HWPort4]>; + +// Store_addr on 237. +// Store_data on 4. +def : WriteRes<WriteStore, [HWPort237, HWPort4]>; +def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; } +def : WriteRes<WriteMove, [HWPort0156]>; +def : WriteRes<WriteZero, []>; + +defm : HWWriteResPair<WriteALU, HWPort0156, 1>; +defm : HWWriteResPair<WriteIMul, HWPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } +defm : HWWriteResPair<WriteShift, HWPort06, 1>; +defm : HWWriteResPair<WriteJump, HWPort06, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [HWPort15]>; + +// This is quite rough, latency depends on the dividend. +def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> { + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +// Scalar and vector floating point. +defm : HWWriteResPair<WriteFAdd, HWPort1, 3>; +defm : HWWriteResPair<WriteFMul, HWPort0, 5>; +defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles. 
+defm : HWWriteResPair<WriteFRcp, HWPort0, 5>; +defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>; +defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; +defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; +defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; +defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>; +defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>; +defm : HWWriteResPair<WriteFBlend, HWPort015, 1>; +defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>; + +def : WriteRes<WriteFVarBlend, [HWPort5]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +// Vector integer operations. +defm : HWWriteResPair<WriteVecShift, HWPort0, 1>; +defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>; +defm : HWWriteResPair<WriteVecALU, HWPort15, 1>; +defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>; +defm : HWWriteResPair<WriteShuffle, HWPort5, 1>; +defm : HWWriteResPair<WriteBlend, HWPort15, 1>; +defm : HWWriteResPair<WriteShuffle256, HWPort5, 3>; + +def : WriteRes<WriteVarBlend, [HWPort5]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> { + let Latency = 2; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1, 1]; +} + +def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> { + let Latency = 6; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> { + let Latency = 6; + let ResourceCycles = [1, 1, 2]; +} + +// String instructions. +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [HWPort0]> { + let Latency = 10; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> { + let Latency = 10; + let ResourceCycles = [3, 1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort16, HWPort5]> { + let Latency = 10; + let ResourceCycles = [3, 2, 4]; +} +def : WriteRes<WritePCmpEStrMLd, [HWPort05, HWPort16, HWPort23]> { + let Latency = 10; + let ResourceCycles = [6, 2, 1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [HWPort0]> { + let Latency = 11; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> { + let Latency = 11; + let ResourceCycles = [3, 1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [HWPort05, HWPort16]> { + let Latency = 11; + let ResourceCycles = [6, 2]; +} +def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort16, HWPort5, HWPort23]> { + let Latency = 11; + let ResourceCycles = [3, 2, 2, 1]; +} + +// AES Instructions. 
+def : WriteRes<WriteAESDecEnc, [HWPort5]> { + let Latency = 7; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteAESIMC, [HWPort5]> { + let Latency = 14; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> { + let Latency = 14; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteAESKeyGen, [HWPort0, HWPort5]> { + let Latency = 10; + let ResourceCycles = [2, 8]; +} +def : WriteRes<WriteAESKeyGenLd, [HWPort0, HWPort5, HWPort23]> { + let Latency = 10; + let ResourceCycles = [2, 7, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> { + let Latency = 7; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> { + let Latency = 7; + let ResourceCycles = [2, 1, 1]; +} + +def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; } +def : WriteRes<WriteFence, [HWPort23, HWPort4]>; +def : WriteRes<WriteNop, []>; + +//================ Exceptions ================// + +//-- Specific Scheduling Models --// + +// Starting with P0. +def WriteP0 : SchedWriteRes<[HWPort0]>; + +def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +def WriteP01 : SchedWriteRes<[HWPort01]>; + +def Write2P01 : SchedWriteRes<[HWPort01]> { + let NumMicroOps = 2; +} +def Write3P01 : SchedWriteRes<[HWPort01]> { + let NumMicroOps = 3; +} + +def WriteP015 : SchedWriteRes<[HWPort015]>; + +def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> { + let NumMicroOps = 2; +} +def WriteP06 : SchedWriteRes<[HWPort06]>; + +def Write2P06 : SchedWriteRes<[HWPort06]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} + +def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> { + let NumMicroOps = 2; +} + +def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> { + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} + +def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def Write5P0156 : SchedWriteRes<[HWPort0156]> { + let NumMicroOps = 5; + let ResourceCycles = [5]; +} + +def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [1, 2, 1]; +} + +def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [2, 2, 1]; +} + +def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [3, 2, 1]; +} + +// Starting with P1. 
+def WriteP1 : SchedWriteRes<[HWPort1]>; + +def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> { + let NumMicroOps = 2; +} +def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> { + let Latency = 3; +} +def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> { + let Latency = 7; +} + +def Write2P1 : SchedWriteRes<[HWPort1]> { + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> { + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def WriteP15 : SchedWriteRes<[HWPort15]>; +def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> { + let Latency = 4; +} + +def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +// Starting with P2. +def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [2, 1]; +} + +// Starting with P5. +def WriteP5 : SchedWriteRes<[HWPort5]>; +def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +// Notation: +// - r: register. +// - mm: 64 bit mmx register. +// - x = 128 bit xmm register. +// - (x)mm = mmx or xmm register. +// - y = 256 bit ymm register. +// - v = any vector register. +// - m = memory. + +//=== Integer Instructions ===// +//-- Move instructions --// + +// MOV. +// r16,m. +def : InstRW<[WriteALULd], (instregex "MOV16rm")>; + +// MOVSX, MOVZX. +// r,m. +def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>; + +// CMOVcc. +// r,r. +def : InstRW<[Write2P0156_Lat2], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>; +// r,m. +def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>; + +// XCHG. +// r,r. +def WriteXCHG : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let ResourceCycles = [3]; +} + +def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; + +// r,m. +def WriteXCHGrm : SchedWriteRes<[]> { + let Latency = 21; + let NumMicroOps = 8; +} +def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>; + +// XLAT. +def WriteXLAT : SchedWriteRes<[]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteXLAT], (instregex "XLAT")>; + +// PUSH. +// m. +def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>; + +// PUSHF. +def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> { + let NumMicroOps = 4; +} +def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>; + +// PUSHA. +def WritePushA : SchedWriteRes<[]> { + let NumMicroOps = 19; +} +def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>; + +// POP. +// m. +def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>; + +// POPF. +def WritePopF : SchedWriteRes<[]> { + let NumMicroOps = 9; +} +def : InstRW<[WritePopF], (instregex "POPF(16|32)")>; + +// POPA. +def WritePopA : SchedWriteRes<[]> { + let NumMicroOps = 18; +} +def : InstRW<[WritePopA], (instregex "POPA(16|32)")>; + +// LAHF SAHF. +def : InstRW<[WriteP06], (instregex "(S|L)AHF")>; + +// BSWAP. +// r32. 
+def WriteBSwap32 : SchedWriteRes<[HWPort15]>; +def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>; + +// r64. +def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>; + +// MOVBE. +// r16,m16 / r64,m64. +def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>; + +// r32, m32. +def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>; + +// m16,r16. +def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>; + +// m32,r32. +def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>; + +// m64,r64. +def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>; + +//-- Arithmetic instructions --// + +// ADD SUB. +// m,r/i. +def : InstRW<[Write2P0156_2P237_P4], + (instregex "(ADD|SUB)(8|16|32|64)m(r|i)", + "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>; + +// ADC SBB. +// r,r/i. +def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)", + "(ADC|SBB)(16|32|64)ri8", + "(ADC|SBB)64ri32", + "(ADC|SBB)(8|16|32|64)rr_REV")>; + +// r,m. +def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>; + +// m,r/i. +def : InstRW<[Write3P0156_2P237_P4], + (instregex "(ADC|SBB)(8|16|32|64)m(r|i)", + "(ADC|SBB)(16|32|64)mi8", + "(ADC|SBB)64mi32")>; + +// INC DEC NOT NEG. +// m. +def : InstRW<[WriteP0156_2P237_P4], + (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m", + "(INC|DEC)64(16|32)m")>; + +// MUL IMUL. +// r16. +def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>; + +// m16. +def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 5; +} +def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>; + +// r32. +def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>; + +// m32. +def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 4; +} +def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>; + +// r64. +def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> { + let Latency = 3; + let NumMicroOps = 2; +} +def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>; + +// m64. +def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>; + +// r16,r16. +def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>; + +// r16,m16. +def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>; + +// MULX. +// r32,r32,r32. +def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteMulX32], (instregex "MULX32rr")>; + +// r32,r32,m32. 
+def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>; + +// r64,r64,r64. +def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[WriteMulX64], (instregex "MULX64rr")>; + +// r64,r64,m64. +def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; +} +def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>; + +// DIV. +// r8. +def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 9; +} +def : InstRW<[WriteDiv8], (instregex "DIV8r")>; + +// r16. +def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 10; +} +def : InstRW<[WriteDiv16], (instregex "DIV16r")>; + +// r32. +def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 10; +} +def : InstRW<[WriteDiv32], (instregex "DIV32r")>; + +// r64. +def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 32; + let NumMicroOps = 36; +} +def : InstRW<[WriteDiv64], (instregex "DIV64r")>; + +// IDIV. +// r8. +def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 9; +} +def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>; + +// r16. +def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 10; +} +def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>; + +// r32. +def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 9; +} +def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>; + +// r64. +def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 39; + let NumMicroOps = 59; +} +def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>; + +//-- Logic instructions --// + +// AND OR XOR. +// m,r/i. +def : InstRW<[Write2P0156_2P237_P4], + (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)", + "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>; + +// SHR SHL SAR. +// m,i. +def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>; + +// r,cl. +def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>; + +// m,cl. +def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> { + let NumMicroOps = 6; + let ResourceCycles = [3, 2, 1]; +} +def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>; + +// ROR ROL. +// r,1. +def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>; + +// m,i. +def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 5; + let ResourceCycles = [2, 2, 1]; +} +def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>; + +// r,cl. +def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>; + +// m,cl. +def WriteRotateRMWCL : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>; + +// RCR RCL. +// r,1. +def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>; + +// m,1. 
+def WriteRCm1 : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>; + +// r,i. +def WriteRCri : SchedWriteRes<[HWPort0156]> { + let Latency = 6; + let NumMicroOps = 8; +} +def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>; + +// m,i. +def WriteRCmi : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>; + +// SHRD SHLD. +// r,r,i. +def WriteShDrr : SchedWriteRes<[HWPort1]> { + let Latency = 3; +} +def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>; + +// m,r,i. +def WriteShDmr : SchedWriteRes<[]> { + let NumMicroOps = 5; +} +def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>; + +// r,r,cl. +def WriteShlDCL : SchedWriteRes<[HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; +} +def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>; + +// r,r,cl. +def WriteShrDCL : SchedWriteRes<[HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>; + +// m,r,cl. +def WriteShDmrCL : SchedWriteRes<[]> { + let NumMicroOps = 7; +} +def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>; + +// BT. +// r,r/i. +def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>; + +// m,r. +def WriteBTmr : SchedWriteRes<[]> { + let NumMicroOps = 10; +} +def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>; + +// m,i. +def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>; + +// BTR BTS BTC. +// r,r,i. +def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>; + +// m,r. +def WriteBTRSCmr : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>; + +// m,i. +def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>; + +// BSF BSR. +// r,r. +def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>; +// r,m. +def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>; + +// SETcc. +// r. +def : InstRW<[WriteShift], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>; +// m. +def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteSetCCm], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>; + +// CLD STD. +def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>; + +// LZCNT TZCNT. +// r,r. +def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>; +// r,m. +def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>; + +// ANDN. +// r,r. +def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>; +// r,m. +def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>; + +// BLSI BLSMSK BLSR. +// r,r. +def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>; +// r,m. +def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>; + +// BEXTR. +// r,r,r. +def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>; +// r,m,r. +def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>; + +// BZHI. +// r,r,r. +def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>; + +// PDEP PEXT. +// r,r,r. +def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; + +//-- Control transfer instructions --// + +// J(E|R)CXZ. 
+def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>; + +// LOOP. +def WriteLOOP : SchedWriteRes<[]> { + let NumMicroOps = 7; +} +def : InstRW<[WriteLOOP], (instregex "LOOP")>; + +// LOOP(N)E +def WriteLOOPE : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>; + +// CALL. +// r. +def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>; + +// m. +def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>; + +// RET. +def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>; + +// i. +def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> { + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>; + +// BOUND. +// r,m. +def WriteBOUND : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>; + +// INTO. +def WriteINTO : SchedWriteRes<[]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteINTO], (instregex "INTO")>; + +//-- String instructions --// + +// LODSB/W. +def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>; + +// LODSD/Q. +def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>; + +// STOS. +def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>; + +// MOVS. +def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 5; + let ResourceCycles = [2, 1, 2]; +} +def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>; + +// SCAS. +def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>; + +// CMPS. +def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 5; + let ResourceCycles = [2, 3]; +} +def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>; + +//-- Synchronization instructions --// + +// XADD. +def WriteXADD : SchedWriteRes<[]> { + let NumMicroOps = 5; +} +def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>; + +// CMPXCHG. +def WriteCMPXCHG : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>; + +// CMPXCHG8B. +def WriteCMPXCHG8B : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>; + +// CMPXCHG16B. +def WriteCMPXCHG16B : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>; + +//-- Other --// + +// PAUSE. +def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> { + let NumMicroOps = 5; + let ResourceCycles = [1, 3]; +} +def : InstRW<[WritePAUSE], (instregex "PAUSE")>; + +// LEAVE. +def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>; + +// XGETBV. +def WriteXGETBV : SchedWriteRes<[]> { + let NumMicroOps = 8; +} +def : InstRW<[WriteXGETBV], (instregex "XGETBV")>; + +// RDTSC. +def WriteRDTSC : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteRDTSC], (instregex "RDTSC")>; + +// RDPMC. +def WriteRDPMC : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteRDPMC], (instregex "RDPMC")>; + +// RDRAND. 
+def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> { + let NumMicroOps = 17; + let ResourceCycles = [1, 16]; +} +def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>; + +//=== Floating Point x87 Instructions ===// +//-- Move instructions --// + +// FLD. +// m80. +def : InstRW<[WriteP01], (instregex "LD_Frr")>; + +def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [2, 2]; +} +def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>; + +// FBLD. +// m80. +def WriteFBLD : SchedWriteRes<[]> { + let Latency = 47; + let NumMicroOps = 43; +} +def : InstRW<[WriteFBLD], (instregex "FBLDm")>; + +// FST(P). +// r. +def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>; + +// m80. +def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> { + let NumMicroOps = 7; + let ResourceCycles = [3, 2, 2]; +} +def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>; + +// FBSTP. +// m80. +def WriteFBSTP : SchedWriteRes<[]> { + let NumMicroOps = 226; +} +def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>; + +// FXCHG. +def : InstRW<[WriteNop], (instregex "XCH_F")>; + +// FILD. +def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; +} +def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>; + +// FIST(P) FISTTP. +def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>; + +// FLDZ. +def : InstRW<[WriteP01], (instregex "LD_F0")>; + +// FLD1. +def : InstRW<[Write2P01], (instregex "LD_F1")>; + +// FLDPI FLDL2E etc. +def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>; + +// FCMOVcc. +def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>; + +// FNSTSW. +// AX. +def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>; + +// m16. +def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> { + let Latency = 6; + let NumMicroOps = 3; +} +def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>; + +// FLDCW. +def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>; + +// FNSTCW. +def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>; + +// FINCSTP FDECSTP. +def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>; + +// FFREE. +def : InstRW<[WriteP01], (instregex "FFREE")>; + +// FNSAVE. +def WriteFNSAVE : SchedWriteRes<[]> { + let NumMicroOps = 147; +} +def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>; + +// FRSTOR. +def WriteFRSTOR : SchedWriteRes<[]> { + let NumMicroOps = 90; +} +def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>; + +//-- Arithmetic instructions --// + +// FABS. +def : InstRW<[WriteP0], (instregex "ABS_F")>; + +// FCHS. +def : InstRW<[WriteP0], (instregex "CHS_F")>; + +// FCOM(P) FUCOM(P). +// r. +def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr", + "UCOM_FPr")>; +// m. +def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>; + +// FCOMPP FUCOMPP. +// r. +def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>; + +// FCOMI(P) FUCOMI(P). +// m. 
+def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr", + "UCOM_FIPr")>; + +// FICOM(P). +def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>; + +// FTST. +def : InstRW<[WriteP1], (instregex "TST_F")>; + +// FXAM. +def : InstRW<[Write2P1], (instregex "FXAM")>; + +// FPREM. +def WriteFPREM : SchedWriteRes<[]> { + let Latency = 19; + let NumMicroOps = 28; +} +def : InstRW<[WriteFPREM], (instregex "FPREM")>; + +// FPREM1. +def WriteFPREM1 : SchedWriteRes<[]> { + let Latency = 27; + let NumMicroOps = 41; +} +def : InstRW<[WriteFPREM1], (instregex "FPREM1")>; + +// FRNDINT. +def WriteFRNDINT : SchedWriteRes<[]> { + let Latency = 11; + let NumMicroOps = 17; +} +def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>; + +//-- Math instructions --// + +// FSCALE. +def WriteFSCALE : SchedWriteRes<[]> { + let Latency = 75; // 49-125 + let NumMicroOps = 50; // 25-75 +} +def : InstRW<[WriteFSCALE], (instregex "FSCALE")>; + +// FXTRACT. +def WriteFXTRACT : SchedWriteRes<[]> { + let Latency = 15; + let NumMicroOps = 17; +} +def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>; + +//-- Other instructions --// + +// FNOP. +def : InstRW<[WriteP01], (instregex "FNOP")>; + +// WAIT. +def : InstRW<[Write2P01], (instregex "WAIT")>; + +// FNCLEX. +def : InstRW<[Write5P0156], (instregex "FNCLEX")>; + +// FNINIT. +def WriteFNINIT : SchedWriteRes<[]> { + let NumMicroOps = 26; +} +def : InstRW<[WriteFNINIT], (instregex "FNINIT")>; + +//=== Integer MMX and XMM Instructions ===// +//-- Move instructions --// + +// MOVD. +// r32/64 <- (x)mm. +def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr", + "VMOVPDI2DIrr", "MOVPDI2DIrr")>; + +// (x)mm <- r32/64. +def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr", + "VMOVDI2PDIrr", "MOVDI2PDIrr")>; + +// MOVQ. +// r64 <- (x)mm. +def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>; + +// (x)mm <- r64. +def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>; + +// (x)mm <- (x)mm. +def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>; + +// (V)MOVDQA/U. +// x <- x. +def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr", + "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV", + "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>; + +// MOVDQ2Q. +def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>; + +// MOVQ2DQ. +def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>; + + +// PACKSSWB/DW. +// mm <- mm. +def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr", + "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>; + +// mm <- m64. +def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 3]; +} +def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm", + "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>; + +// VPMOVSX/ZX BW BD BQ DW DQ. +// y <- x. +def WriteVPMOVSX : SchedWriteRes<[HWPort5]> { + let Latency = 3; + let NumMicroOps = 1; +} +def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>; + +// PBLENDW. +// x,x,i / v,v,v,i +def WritePBLENDWr : SchedWriteRes<[HWPort5]>; +def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>; + +// x,m,i / v,v,m,i +def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> { + let NumMicroOps = 2; + let Latency = 4; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>; + +// VPBLENDD. +// v,v,v,i. 
+def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>; +def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>; + +// v,v,m,i +def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> { + let NumMicroOps = 2; + let Latency = 4; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>; + +// MASKMOVQ. +def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 2]; +} +def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>; + +// MASKMOVDQU. +def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [4, 2, 4]; +} +def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>; + +// VPMASKMOV D/Q. +// v,v,m. +def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVPMASKMOVr, ReadAfterLd], + (instregex "VPMASKMOV(D|Q)(Y?)rm")>; + +// m, v,v. +def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; + +// PMOVMSKB. +def WritePMOVMSKB : SchedWriteRes<[HWPort0]> { + let Latency = 3; +} +def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>; + +// PEXTR B/W/D/Q. +// r32,x,i. +def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>; + +// m8,x,i. +def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> { + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>; + +// VPBROADCAST B/W. +// x, m8/16. +def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd], + (instregex "VPBROADCAST(B|W)rm")>; + +// y, m8/16 +def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd], + (instregex "VPBROADCAST(B|W)Yrm")>; + +// VPGATHERDD. +// x. +def WriteVPGATHERDD128 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>; + +// y. +def WriteVPGATHERDD256 : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>; + +// VPGATHERQD. +// x. +def WriteVPGATHERQD128 : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>; + +// y. +def WriteVPGATHERQD256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>; + +// VPGATHERDQ. +// x. +def WriteVPGATHERDQ128 : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>; + +// y. +def WriteVPGATHERDQ256 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>; + +// VPGATHERQQ. +// x. 
+def WriteVPGATHERQQ128 : SchedWriteRes<[]> { + let NumMicroOps = 14; +} +def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>; + +// y. +def WriteVPGATHERQQ256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>; + +//-- Arithmetic instructions --// + +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64", + "MMX_PHADDSWrr64", + "MMX_PHSUB(W|D)rr64", + "MMX_PHSUBSWrr64", + "(V?)PH(ADD|SUB)(W|D)(Y?)rr", + "(V?)PH(ADD|SUB)SWrr(256)?")>; + +// v <- v,m. +def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WritePHADDSUBm, ReadAfterLd], + (instregex "MMX_PHADD(W?)rm64", + "MMX_PHADDSWrm64", + "MMX_PHSUB(W|D)rm64", + "MMX_PHSUBSWrm64", + "(V?)PH(ADD|SUB)(W|D)(Y?)rm", + "(V?)PH(ADD|SUB)SWrm(128|256)?")>; + +// PCMPGTQ. +// v <- v,v. +def WritePCMPGTQr : SchedWriteRes<[HWPort0]> { + let Latency = 5; + let NumMicroOps = 1; +} +def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; + +// v <- v,m. +def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>; + +// PMULLD. +// x,x / y,y,y. +def WritePMULLDr : SchedWriteRes<[HWPort0]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>; + +// x,m / y,y,m. +def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>; + +//-- Logic instructions --// + +// PTEST. +// v,v. +def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>; + +// v,m. +def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rm")>; + +// PSLL,PSRL,PSRA W/D/Q. +// x,x / v,v,x. +def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>; + +// PSLL,PSRL DQ. +def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>; + +//-- Other --// + +// EMMS. +def WriteEMMS : SchedWriteRes<[]> { + let Latency = 13; + let NumMicroOps = 31; +} +def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>; + +//=== Floating Point XMM and YMM Instructions ===// +//-- Move instructions --// + +// MOVMSKP S/D. +// r32 <- x. +def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> { + let Latency = 3; +} +def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>; + +// r32 <- y. +def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> { + let Latency = 2; +} +def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>; + +// VPERM2F128. +def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>; +def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>; + +// BLENDVP S/D. 
+def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>; +def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>; + +// VBROADCASTF128. +def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>; + +// EXTRACTPS. +// r32,x,i. +def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> { + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>; + +// m32,x,i. +def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>; + +// VEXTRACTF128. +// x,y,i. +def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>; + +// m128,y,i. +def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>; + +// VINSERTF128. +// y,y,x,i. +def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>; + +// y,y,m128,i. +def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteFShuffle256, ReadAfterLd], (instregex "VINSERTF128rm")>; + +// VMASKMOVP S/D. +// v,v,m. +def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>; + +// m128,x,x. +def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>; + +// m256,y,y. +def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>; + +// VGATHERDPS. +// x. +def WriteVGATHERDPS128 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>; + +// y. +def WriteVGATHERDPS256 : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>; + +// VGATHERQPS. +// x. +def WriteVGATHERQPS128 : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>; + +// y. +def WriteVGATHERQPS256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>; + +// VGATHERDPD. +// x. +def WriteVGATHERDPD128 : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>; + +// y. +def WriteVGATHERDPD256 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>; + +// VGATHERQPD. +// x. +def WriteVGATHERQPD128 : SchedWriteRes<[]> { + let NumMicroOps = 14; +} +def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>; + +// y. +def WriteVGATHERQPD256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>; + +//-- Conversion instructions --// + +// CVTPD2PS. +// x,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>; + +// x,m128. 
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+
+// x,y.
+def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+
+// x,m256.
+def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+
+// CVTSD2SS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+
+// x,m64.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+
+// CVTPS2PD.
+// x,x.
+def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+
+// x,m64.
+// y,m128.
+def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+
+// y,x.
+def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+
+// CVTSS2SD.
+// x,x.
+def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+
+// x,m32.
+def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
+
+// y,x.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
+
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// x,y.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
+
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+// CVTSI2SS.
+// x,r32.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
+
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SD(64)?rr")>;
+
+// CVT(T)SD2SI.
+// r32/64,x.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32/64,m64.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
+// m,v,i.
+def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
+
+// VCVTPH2PS.
+// v,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+
+// x,m / v,v,m.
+def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+
+// MUL SS/SD PS/PD.
+// x,x / v,v,v.
+def WriteMULr : SchedWriteRes<[HWPort01]> {
+  let Latency = 5;
+}
+def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+
+// x,m / v,v,m.
+def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+
+// VDIVPS.
+// y,y,y.
+def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 19; // 18-21 cycles.
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+
+// y,y,m256.
+def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 23; // 18-21 + 4 cycles.
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+
+// VDIVPD.
+// y,y,y.
+def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 27; // 19-35 cycles.
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>;
+
+// y,y,m256.
+def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 31; // 19-35 + 4 cycles.
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>;
+
+// VRCPPS.
+// y,y.
+def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+
+// y,m256.
+def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 11;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>;
+
+// ROUND SS/SD PS/PD.
+// v,v,i.
+def WriteROUNDr : SchedWriteRes<[HWPort1]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+
+// v,m,i.
+def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+  let Latency = 14;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>;
+
+// x,m,i / v,v,m,i.
+def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> {
+  let Latency = 18;
+  let NumMicroOps = 6;
+  let ResourceCycles = [2, 1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>;
+
+// DPPD.
+// x,x,i.
+def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>; + +// x,m,i. +def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>; + +// VFMADD. +// v,v,v. +def WriteFMADDr : SchedWriteRes<[HWPort01]> { + let Latency = 5; + let NumMicroOps = 1; +} +def : InstRW<[WriteFMADDr], + (instregex + // 3p forms. + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?", + // 3s forms. + "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r", + // 4s/4s_int forms. + "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?", + // 4p forms. + "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>; + +// v,v,m. +def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteFMADDm], + (instregex + // 3p forms. + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?", + // 3s forms. + "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m", + // 4s/4s_int forms. + "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?", + // 4p forms. + "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>; + +//-- Math instructions --// + +// VSQRTPS. +// y,y. +def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 19; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>; + +// y,m256. +def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 23; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>; + +// VSQRTPD. +// y,y. +def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 28; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>; + +// y,m256. +def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 32; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>; + +// RSQRT SS/PS. +// x,x. +def WriteRSQRTr : SchedWriteRes<[HWPort0]> { + let Latency = 5; +} +def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>; + +// x,m128. +def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>; + +// RSQRTPS 256. +// y,y. +def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>; + +// y,m256. +def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>; + +//-- Logic instructions --// + +// AND, ANDN, OR, XOR PS/PD. +// x,x / v,v,v. +def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>; +// x,m / v,v,m. +def : InstRW<[WriteP5Ld, ReadAfterLd], + (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>; + +//-- Other instructions --// + +// VZEROUPPER. +def WriteVZEROUPPER : SchedWriteRes<[]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>; + +// VZEROALL. 
+def WriteVZEROALL : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>; + +// LDMXCSR. +def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>; + +// STMXCSR. +def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>; + +} // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td new file mode 100644 index 0000000..eca65c2 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -0,0 +1,250 @@ +//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Sandy Bridge to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SandyBridgeModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SB can decode 4 + // instructions per cycle. + // FIXME: Identify instructions that aren't a single fused micro-op. + let IssueWidth = 4; + let MicroOpBufferSize = 168; // Based on the reorder buffer. + let LoadLatency = 4; + let MispredictPenalty = 16; + + // Based on the LSD (loop-stream detector) queue size. + let LoopMicroOpBufferSize = 28; + + // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SandyBridgeModel in { + +// Sandy Bridge can issue micro-ops to 6 different ports in one cycle. + +// Ports 0, 1, and 5 handle all computation. +def SBPort0 : ProcResource<1>; +def SBPort1 : ProcResource<1>; +def SBPort5 : ProcResource<1>; + +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. +def SBPort23 : ProcResource<2>; + +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +def SBPort4 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>; +def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>; +def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>; + +// 54 Entry Unified Scheduler +def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> { + let BufferSize=54; +} + +// Integer division issued on port 0. +def SBDivider : ProcResource<1>; + +// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 4>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. 
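+//
+// For illustration only: the "defm : SBWriteResPair<WriteALU, SBPort015, 1>"
+// further below expands to a 1-cycle WriteRes<WriteALU, [SBPort015]> for the
+// register form and, through WriteALU.Folded (i.e. WriteALULd), a 5-cycle
+// WriteRes<WriteALULd, [SBPort23, SBPort015]> for the load-folded form.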
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [SBPort4]>; + +def : WriteRes<WriteStore, [SBPort23, SBPort4]>; +def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; } +def : WriteRes<WriteMove, [SBPort015]>; +def : WriteRes<WriteZero, []>; + +defm : SBWriteResPair<WriteALU, SBPort015, 1>; +defm : SBWriteResPair<WriteIMul, SBPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } +defm : SBWriteResPair<WriteShift, SBPort05, 1>; +defm : SBWriteResPair<WriteJump, SBPort5, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [SBPort15]>; + +// This is quite rough, latency depends on the dividend. +def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> { + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +// Scalar and vector floating point. +defm : SBWriteResPair<WriteFAdd, SBPort1, 3>; +defm : SBWriteResPair<WriteFMul, SBPort0, 5>; +defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles. +defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; +defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>; +defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>; +defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; +defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; +defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>; +defm : SBWriteResPair<WriteFShuffle, SBPort5, 1>; +defm : SBWriteResPair<WriteFBlend, SBPort05, 1>; +def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> { + let Latency = 2; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> { + let Latency = 6; + let ResourceCycles = [1, 1, 1]; +} + +// Vector integer operations. +defm : SBWriteResPair<WriteVecShift, SBPort05, 1>; +defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>; +defm : SBWriteResPair<WriteVecALU, SBPort15, 1>; +defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>; +defm : SBWriteResPair<WriteShuffle, SBPort15, 1>; +defm : SBWriteResPair<WriteBlend, SBPort15, 1>; +def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> { + let Latency = 2; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> { + let Latency = 6; + let ResourceCycles = [1, 1, 1]; +} +def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> { + let Latency = 6; + let ResourceCycles = [1, 1, 1]; +} +def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> { + let Latency = 6; + let ResourceCycles = [1, 1, 1, 1]; +} + +// String instructions. 
+// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [SBPort015]> { + let Latency = 11; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [SBPort015, SBPort23]> { + let Latency = 11; + let ResourceCycles = [3, 1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [SBPort015]> { + let Latency = 11; + let ResourceCycles = [8]; +} +def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> { + let Latency = 11; + let ResourceCycles = [7, 1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [SBPort015]> { + let Latency = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> { + let Latency = 3; + let ResourceCycles = [3, 1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [SBPort015]> { + let Latency = 4; + let ResourceCycles = [8]; +} +def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> { + let Latency = 4; + let ResourceCycles = [7, 1]; +} + +// AES Instructions. +def : WriteRes<WriteAESDecEnc, [SBPort015]> { + let Latency = 8; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> { + let Latency = 8; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteAESIMC, [SBPort015]> { + let Latency = 8; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> { + let Latency = 8; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteAESKeyGen, [SBPort015]> { + let Latency = 8; + let ResourceCycles = [11]; +} +def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> { + let Latency = 8; + let ResourceCycles = [10, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [SBPort015]> { + let Latency = 14; + let ResourceCycles = [18]; +} +def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> { + let Latency = 14; + let ResourceCycles = [17, 1]; +} + + +def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; } +def : WriteRes<WriteFence, [SBPort23, SBPort4]>; +def : WriteRes<WriteNop, []>; + +// AVX2 is not supported on that architecture, but we should define the basic +// scheduling resources anyway. +defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>; +defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>; +defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>; +} // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td new file mode 100644 index 0000000..a261356 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Schedule.td @@ -0,0 +1,650 @@ +//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// InstrSchedModel annotations for out-of-order CPUs. +// +// These annotations are independent of the itinerary classes defined below. + +// Instructions with folded loads need to read the memory operand immediately, +// but other register operands don't have to be read until the load is ready. +// These operands are marked with ReadAfterLd. +def ReadAfterLd : SchedRead; + +// Instructions with both a load and a store folded are modeled as a folded +// load + WriteRMW. 
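+// For example (illustrative), a memory-destination ALU op such as
+// "addl %eax, (%rdi)" would be modeled as the load-folded ALU write
+// (WriteALULd) for the load and the add, plus WriteRMW for the store of the
+// result.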
+def WriteRMW : SchedWrite;
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: with and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+  // The SchedWrite to use when a load is folded into the instruction.
+  SchedWrite Folded;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair {
+  // Register-Memory operation.
+  def Ld : SchedWrite;
+  // Register-Register operation.
+  def NAME : X86FoldableSchedWrite {
+    let Folded = !cast<SchedWrite>(NAME#"Ld");
+  }
+}
+
+// Arithmetic.
+defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part.
+defm WriteIDiv : X86SchedWritePair; // Integer division.
+def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteMove : SchedWrite;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
+defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
+defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+defm WriteShuffle : X86SchedWritePair; // Vector shuffles.
+defm WriteBlend : X86SchedWritePair; // Vector blends.
+defm WriteVarBlend : X86SchedWritePair; // Vector variable blends.
+defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+
+// Conversion between integer and float.
+defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm WritePCmpIStrM : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Mask
+defm WritePCmpEStrM : X86SchedWritePair;
+// Packed Compare Implicit Length Strings, Return Index
+defm WritePCmpIStrI : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Index
+defm WritePCmpEStrI : X86SchedWritePair;
+
+// AES instructions.
+defm WriteAESDecEnc : X86SchedWritePair; // Decryption, encryption.
+defm WriteAESIMC : X86SchedWritePair; // InvMixColumn.
+defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm WriteCLMul : X86SchedWritePair;
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// AVX2.
+defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+
+// Old microcoded instructions that nobody uses.
+def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def WriteFence : SchedWrite;
+
+// Nop, not very useful except that it provides a model for nops!
+def WriteNop : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for X86
+def IIC_ALU_MEM : InstrItinClass;
+def IIC_ALU_NONMEM : InstrItinClass;
+def IIC_LEA : InstrItinClass;
+def IIC_LEA_16 : InstrItinClass;
+def IIC_MUL8 : InstrItinClass;
+def IIC_MUL16_MEM : InstrItinClass;
+def IIC_MUL16_REG : InstrItinClass;
+def IIC_MUL32_MEM : InstrItinClass;
+def IIC_MUL32_REG : InstrItinClass;
+def IIC_MUL64 : InstrItinClass;
+// imul by al, ax, eax, rax
+def IIC_IMUL8 : InstrItinClass;
+def IIC_IMUL16_MEM : InstrItinClass;
+def IIC_IMUL16_REG : InstrItinClass;
+def IIC_IMUL32_MEM : InstrItinClass;
+def IIC_IMUL32_REG : InstrItinClass;
+def IIC_IMUL64 : InstrItinClass;
+// imul reg by reg|mem
+def IIC_IMUL16_RM : InstrItinClass;
+def IIC_IMUL16_RR : InstrItinClass;
+def IIC_IMUL32_RM : InstrItinClass;
+def IIC_IMUL32_RR : InstrItinClass;
+def IIC_IMUL64_RM : InstrItinClass;
+def IIC_IMUL64_RR : InstrItinClass;
+// imul reg = reg/mem * imm
+def IIC_IMUL16_RMI : InstrItinClass;
+def IIC_IMUL16_RRI : InstrItinClass;
+def IIC_IMUL32_RMI : InstrItinClass;
+def IIC_IMUL32_RRI : InstrItinClass;
+def IIC_IMUL64_RMI : InstrItinClass;
+def IIC_IMUL64_RRI : InstrItinClass;
+// div
+def IIC_DIV8_MEM : InstrItinClass;
+def IIC_DIV8_REG : InstrItinClass;
+def IIC_DIV16 : InstrItinClass;
+def IIC_DIV32 : InstrItinClass;
+def IIC_DIV64 : InstrItinClass;
+// idiv
+def IIC_IDIV8 : InstrItinClass;
+def IIC_IDIV16 : InstrItinClass;
+def IIC_IDIV32 : InstrItinClass;
+def IIC_IDIV64 : InstrItinClass;
+// neg/not/inc/dec
+def IIC_UNARY_REG : InstrItinClass;
+def IIC_UNARY_MEM : InstrItinClass;
+// add/sub/and/or/xor/sbb/cmp/test
+def IIC_BIN_MEM : InstrItinClass;
+def IIC_BIN_NONMEM : InstrItinClass;
+// adc/sbb
+def IIC_BIN_CARRY_MEM : InstrItinClass;
+def IIC_BIN_CARRY_NONMEM : InstrItinClass;
+// shift/rotate
+def IIC_SR : InstrItinClass;
+// shift double
+def IIC_SHD16_REG_IM : InstrItinClass;
+def IIC_SHD16_REG_CL : InstrItinClass;
+def IIC_SHD16_MEM_IM : InstrItinClass;
+def IIC_SHD16_MEM_CL : InstrItinClass;
+def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass;
+def IIC_SHD32_MEM_IM : InstrItinClass;
+def IIC_SHD32_MEM_CL : InstrItinClass;
+def IIC_SHD64_REG_IM : InstrItinClass;
+def IIC_SHD64_REG_CL : 
InstrItinClass; +def IIC_SHD64_MEM_IM : InstrItinClass; +def IIC_SHD64_MEM_CL : InstrItinClass; +// cmov +def IIC_CMOV16_RM : InstrItinClass; +def IIC_CMOV16_RR : InstrItinClass; +def IIC_CMOV32_RM : InstrItinClass; +def IIC_CMOV32_RR : InstrItinClass; +def IIC_CMOV64_RM : InstrItinClass; +def IIC_CMOV64_RR : InstrItinClass; +// set +def IIC_SET_R : InstrItinClass; +def IIC_SET_M : InstrItinClass; +// jmp/jcc/jcxz +def IIC_Jcc : InstrItinClass; +def IIC_JCXZ : InstrItinClass; +def IIC_JMP_REL : InstrItinClass; +def IIC_JMP_REG : InstrItinClass; +def IIC_JMP_MEM : InstrItinClass; +def IIC_JMP_FAR_MEM : InstrItinClass; +def IIC_JMP_FAR_PTR : InstrItinClass; +// loop +def IIC_LOOP : InstrItinClass; +def IIC_LOOPE : InstrItinClass; +def IIC_LOOPNE : InstrItinClass; +// call +def IIC_CALL_RI : InstrItinClass; +def IIC_CALL_MEM : InstrItinClass; +def IIC_CALL_FAR_MEM : InstrItinClass; +def IIC_CALL_FAR_PTR : InstrItinClass; +// ret +def IIC_RET : InstrItinClass; +def IIC_RET_IMM : InstrItinClass; +//sign extension movs +def IIC_MOVSX : InstrItinClass; +def IIC_MOVSX_R16_R8 : InstrItinClass; +def IIC_MOVSX_R16_M8 : InstrItinClass; +def IIC_MOVSX_R16_R16 : InstrItinClass; +def IIC_MOVSX_R32_R32 : InstrItinClass; +//zero extension movs +def IIC_MOVZX : InstrItinClass; +def IIC_MOVZX_R16_R8 : InstrItinClass; +def IIC_MOVZX_R16_M8 : InstrItinClass; + +def IIC_REP_MOVS : InstrItinClass; +def IIC_REP_STOS : InstrItinClass; + +// SSE scalar/parallel binary operations +def IIC_SSE_ALU_F32S_RR : InstrItinClass; +def IIC_SSE_ALU_F32S_RM : InstrItinClass; +def IIC_SSE_ALU_F64S_RR : InstrItinClass; +def IIC_SSE_ALU_F64S_RM : InstrItinClass; +def IIC_SSE_MUL_F32S_RR : InstrItinClass; +def IIC_SSE_MUL_F32S_RM : InstrItinClass; +def IIC_SSE_MUL_F64S_RR : InstrItinClass; +def IIC_SSE_MUL_F64S_RM : InstrItinClass; +def IIC_SSE_DIV_F32S_RR : InstrItinClass; +def IIC_SSE_DIV_F32S_RM : InstrItinClass; +def IIC_SSE_DIV_F64S_RR : InstrItinClass; +def IIC_SSE_DIV_F64S_RM : InstrItinClass; +def IIC_SSE_ALU_F32P_RR : InstrItinClass; +def IIC_SSE_ALU_F32P_RM : InstrItinClass; +def IIC_SSE_ALU_F64P_RR : InstrItinClass; +def IIC_SSE_ALU_F64P_RM : InstrItinClass; +def IIC_SSE_MUL_F32P_RR : InstrItinClass; +def IIC_SSE_MUL_F32P_RM : InstrItinClass; +def IIC_SSE_MUL_F64P_RR : InstrItinClass; +def IIC_SSE_MUL_F64P_RM : InstrItinClass; +def IIC_SSE_DIV_F32P_RR : InstrItinClass; +def IIC_SSE_DIV_F32P_RM : InstrItinClass; +def IIC_SSE_DIV_F64P_RR : InstrItinClass; +def IIC_SSE_DIV_F64P_RM : InstrItinClass; + +def IIC_SSE_COMIS_RR : InstrItinClass; +def IIC_SSE_COMIS_RM : InstrItinClass; + +def IIC_SSE_HADDSUB_RR : InstrItinClass; +def IIC_SSE_HADDSUB_RM : InstrItinClass; + +def IIC_SSE_BIT_P_RR : InstrItinClass; +def IIC_SSE_BIT_P_RM : InstrItinClass; + +def IIC_SSE_INTALU_P_RR : InstrItinClass; +def IIC_SSE_INTALU_P_RM : InstrItinClass; +def IIC_SSE_INTALUQ_P_RR : InstrItinClass; +def IIC_SSE_INTALUQ_P_RM : InstrItinClass; + +def IIC_SSE_INTMUL_P_RR : InstrItinClass; +def IIC_SSE_INTMUL_P_RM : InstrItinClass; + +def IIC_SSE_INTSH_P_RR : InstrItinClass; +def IIC_SSE_INTSH_P_RM : InstrItinClass; +def IIC_SSE_INTSH_P_RI : InstrItinClass; + +def IIC_SSE_INTSHDQ_P_RI : InstrItinClass; + +def IIC_SSE_SHUFP : InstrItinClass; +def IIC_SSE_PSHUF_RI : InstrItinClass; +def IIC_SSE_PSHUF_MI : InstrItinClass; + +def IIC_SSE_UNPCK : InstrItinClass; + +def IIC_SSE_MOVMSK : InstrItinClass; +def IIC_SSE_MASKMOV : InstrItinClass; + +def IIC_SSE_PEXTRW : InstrItinClass; +def IIC_SSE_PINSRW : InstrItinClass; + +def IIC_SSE_PABS_RR : InstrItinClass; 
+def IIC_SSE_PABS_RM : InstrItinClass; + +def IIC_SSE_SQRTPS_RR : InstrItinClass; +def IIC_SSE_SQRTPS_RM : InstrItinClass; +def IIC_SSE_SQRTSS_RR : InstrItinClass; +def IIC_SSE_SQRTSS_RM : InstrItinClass; +def IIC_SSE_SQRTPD_RR : InstrItinClass; +def IIC_SSE_SQRTPD_RM : InstrItinClass; +def IIC_SSE_SQRTSD_RR : InstrItinClass; +def IIC_SSE_SQRTSD_RM : InstrItinClass; + +def IIC_SSE_RSQRTPS_RR : InstrItinClass; +def IIC_SSE_RSQRTPS_RM : InstrItinClass; +def IIC_SSE_RSQRTSS_RR : InstrItinClass; +def IIC_SSE_RSQRTSS_RM : InstrItinClass; + +def IIC_SSE_RCPP_RR : InstrItinClass; +def IIC_SSE_RCPP_RM : InstrItinClass; +def IIC_SSE_RCPS_RR : InstrItinClass; +def IIC_SSE_RCPS_RM : InstrItinClass; + +def IIC_SSE_MOV_S_RR : InstrItinClass; +def IIC_SSE_MOV_S_RM : InstrItinClass; +def IIC_SSE_MOV_S_MR : InstrItinClass; + +def IIC_SSE_MOVA_P_RR : InstrItinClass; +def IIC_SSE_MOVA_P_RM : InstrItinClass; +def IIC_SSE_MOVA_P_MR : InstrItinClass; + +def IIC_SSE_MOVU_P_RR : InstrItinClass; +def IIC_SSE_MOVU_P_RM : InstrItinClass; +def IIC_SSE_MOVU_P_MR : InstrItinClass; + +def IIC_SSE_MOVDQ : InstrItinClass; +def IIC_SSE_MOVD_ToGP : InstrItinClass; +def IIC_SSE_MOVQ_RR : InstrItinClass; + +def IIC_SSE_MOV_LH : InstrItinClass; + +def IIC_SSE_LDDQU : InstrItinClass; + +def IIC_SSE_MOVNT : InstrItinClass; + +def IIC_SSE_PHADDSUBD_RR : InstrItinClass; +def IIC_SSE_PHADDSUBD_RM : InstrItinClass; +def IIC_SSE_PHADDSUBSW_RR : InstrItinClass; +def IIC_SSE_PHADDSUBSW_RM : InstrItinClass; +def IIC_SSE_PHADDSUBW_RR : InstrItinClass; +def IIC_SSE_PHADDSUBW_RM : InstrItinClass; +def IIC_SSE_PSHUFB_RR : InstrItinClass; +def IIC_SSE_PSHUFB_RM : InstrItinClass; +def IIC_SSE_PSIGN_RR : InstrItinClass; +def IIC_SSE_PSIGN_RM : InstrItinClass; + +def IIC_SSE_PMADD : InstrItinClass; +def IIC_SSE_PMULHRSW : InstrItinClass; +def IIC_SSE_PALIGNRR : InstrItinClass; +def IIC_SSE_PALIGNRM : InstrItinClass; +def IIC_SSE_MWAIT : InstrItinClass; +def IIC_SSE_MONITOR : InstrItinClass; + +def IIC_SSE_PREFETCH : InstrItinClass; +def IIC_SSE_PAUSE : InstrItinClass; +def IIC_SSE_LFENCE : InstrItinClass; +def IIC_SSE_MFENCE : InstrItinClass; +def IIC_SSE_SFENCE : InstrItinClass; +def IIC_SSE_LDMXCSR : InstrItinClass; +def IIC_SSE_STMXCSR : InstrItinClass; + +def IIC_SSE_CVT_PD_RR : InstrItinClass; +def IIC_SSE_CVT_PD_RM : InstrItinClass; +def IIC_SSE_CVT_PS_RR : InstrItinClass; +def IIC_SSE_CVT_PS_RM : InstrItinClass; +def IIC_SSE_CVT_PI2PS_RR : InstrItinClass; +def IIC_SSE_CVT_PI2PS_RM : InstrItinClass; +def IIC_SSE_CVT_Scalar_RR : InstrItinClass; +def IIC_SSE_CVT_Scalar_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass; +def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass; +def IIC_SSE_CVT_SD2SI_RM : InstrItinClass; +def IIC_SSE_CVT_SD2SI_RR : InstrItinClass; + +// MMX +def IIC_MMX_MOV_MM_RM : InstrItinClass; +def IIC_MMX_MOV_REG_MM : InstrItinClass; +def IIC_MMX_MOVQ_RM : InstrItinClass; +def IIC_MMX_MOVQ_RR : InstrItinClass; + +def IIC_MMX_ALU_RM : InstrItinClass; +def IIC_MMX_ALU_RR : InstrItinClass; +def IIC_MMX_ALUQ_RM : InstrItinClass; +def IIC_MMX_ALUQ_RR : InstrItinClass; +def IIC_MMX_PHADDSUBW_RM : InstrItinClass; +def IIC_MMX_PHADDSUBW_RR : InstrItinClass; +def IIC_MMX_PHADDSUBD_RM : InstrItinClass; +def IIC_MMX_PHADDSUBD_RR : InstrItinClass; +def IIC_MMX_PMUL : InstrItinClass; +def IIC_MMX_MISC_FUNC_MEM : InstrItinClass; +def IIC_MMX_MISC_FUNC_REG : InstrItinClass; +def IIC_MMX_PSADBW : InstrItinClass; +def IIC_MMX_SHIFT_RI : 
InstrItinClass; +def IIC_MMX_SHIFT_RM : InstrItinClass; +def IIC_MMX_SHIFT_RR : InstrItinClass; +def IIC_MMX_UNPCK_H_RM : InstrItinClass; +def IIC_MMX_UNPCK_H_RR : InstrItinClass; +def IIC_MMX_UNPCK_L : InstrItinClass; +def IIC_MMX_PCK_RM : InstrItinClass; +def IIC_MMX_PCK_RR : InstrItinClass; +def IIC_MMX_PSHUF : InstrItinClass; +def IIC_MMX_PEXTR : InstrItinClass; +def IIC_MMX_PINSRW : InstrItinClass; +def IIC_MMX_MASKMOV : InstrItinClass; + +def IIC_MMX_CVT_PD_RR : InstrItinClass; +def IIC_MMX_CVT_PD_RM : InstrItinClass; +def IIC_MMX_CVT_PS_RR : InstrItinClass; +def IIC_MMX_CVT_PS_RM : InstrItinClass; + +def IIC_CMPX_LOCK : InstrItinClass; +def IIC_CMPX_LOCK_8 : InstrItinClass; +def IIC_CMPX_LOCK_8B : InstrItinClass; +def IIC_CMPX_LOCK_16B : InstrItinClass; + +def IIC_XADD_LOCK_MEM : InstrItinClass; +def IIC_XADD_LOCK_MEM8 : InstrItinClass; + +def IIC_FILD : InstrItinClass; +def IIC_FLD : InstrItinClass; +def IIC_FLD80 : InstrItinClass; +def IIC_FST : InstrItinClass; +def IIC_FST80 : InstrItinClass; +def IIC_FIST : InstrItinClass; +def IIC_FLDZ : InstrItinClass; +def IIC_FUCOM : InstrItinClass; +def IIC_FUCOMI : InstrItinClass; +def IIC_FCOMI : InstrItinClass; +def IIC_FNSTSW : InstrItinClass; +def IIC_FNSTCW : InstrItinClass; +def IIC_FLDCW : InstrItinClass; +def IIC_FNINIT : InstrItinClass; +def IIC_FFREE : InstrItinClass; +def IIC_FNCLEX : InstrItinClass; +def IIC_WAIT : InstrItinClass; +def IIC_FXAM : InstrItinClass; +def IIC_FNOP : InstrItinClass; +def IIC_FLDL : InstrItinClass; +def IIC_F2XM1 : InstrItinClass; +def IIC_FYL2X : InstrItinClass; +def IIC_FPTAN : InstrItinClass; +def IIC_FPATAN : InstrItinClass; +def IIC_FXTRACT : InstrItinClass; +def IIC_FPREM1 : InstrItinClass; +def IIC_FPSTP : InstrItinClass; +def IIC_FPREM : InstrItinClass; +def IIC_FYL2XP1 : InstrItinClass; +def IIC_FSINCOS : InstrItinClass; +def IIC_FRNDINT : InstrItinClass; +def IIC_FSCALE : InstrItinClass; +def IIC_FCOMPP : InstrItinClass; +def IIC_FXSAVE : InstrItinClass; +def IIC_FXRSTOR : InstrItinClass; + +def IIC_FXCH : InstrItinClass; + +// System instructions +def IIC_CPUID : InstrItinClass; +def IIC_INT : InstrItinClass; +def IIC_INT3 : InstrItinClass; +def IIC_INVD : InstrItinClass; +def IIC_INVLPG : InstrItinClass; +def IIC_IRET : InstrItinClass; +def IIC_HLT : InstrItinClass; +def IIC_LXS : InstrItinClass; +def IIC_LTR : InstrItinClass; +def IIC_RDTSC : InstrItinClass; +def IIC_RSM : InstrItinClass; +def IIC_SIDT : InstrItinClass; +def IIC_SGDT : InstrItinClass; +def IIC_SLDT : InstrItinClass; +def IIC_STR : InstrItinClass; +def IIC_SWAPGS : InstrItinClass; +def IIC_SYSCALL : InstrItinClass; +def IIC_SYS_ENTER_EXIT : InstrItinClass; +def IIC_IN_RR : InstrItinClass; +def IIC_IN_RI : InstrItinClass; +def IIC_OUT_RR : InstrItinClass; +def IIC_OUT_IR : InstrItinClass; +def IIC_INS : InstrItinClass; +def IIC_MOV_REG_DR : InstrItinClass; +def IIC_MOV_DR_REG : InstrItinClass; +def IIC_MOV_REG_CR : InstrItinClass; +def IIC_MOV_CR_REG : InstrItinClass; +def IIC_MOV_REG_SR : InstrItinClass; +def IIC_MOV_MEM_SR : InstrItinClass; +def IIC_MOV_SR_REG : InstrItinClass; +def IIC_MOV_SR_MEM : InstrItinClass; +def IIC_LAR_RM : InstrItinClass; +def IIC_LAR_RR : InstrItinClass; +def IIC_LSL_RM : InstrItinClass; +def IIC_LSL_RR : InstrItinClass; +def IIC_LGDT : InstrItinClass; +def IIC_LIDT : InstrItinClass; +def IIC_LLDT_REG : InstrItinClass; +def IIC_LLDT_MEM : InstrItinClass; +def IIC_PUSH_CS : InstrItinClass; +def IIC_PUSH_SR : InstrItinClass; +def IIC_POP_SR : InstrItinClass; +def IIC_POP_SR_SS : InstrItinClass; 
+def IIC_VERR : InstrItinClass; +def IIC_VERW_REG : InstrItinClass; +def IIC_VERW_MEM : InstrItinClass; +def IIC_WRMSR : InstrItinClass; +def IIC_RDMSR : InstrItinClass; +def IIC_RDPMC : InstrItinClass; +def IIC_SMSW : InstrItinClass; +def IIC_LMSW_REG : InstrItinClass; +def IIC_LMSW_MEM : InstrItinClass; +def IIC_ENTER : InstrItinClass; +def IIC_LEAVE : InstrItinClass; +def IIC_POP_MEM : InstrItinClass; +def IIC_POP_REG16 : InstrItinClass; +def IIC_POP_REG : InstrItinClass; +def IIC_POP_F : InstrItinClass; +def IIC_POP_FD : InstrItinClass; +def IIC_POP_A : InstrItinClass; +def IIC_PUSH_IMM : InstrItinClass; +def IIC_PUSH_MEM : InstrItinClass; +def IIC_PUSH_REG : InstrItinClass; +def IIC_PUSH_F : InstrItinClass; +def IIC_PUSH_A : InstrItinClass; +def IIC_BSWAP : InstrItinClass; +def IIC_BIT_SCAN_MEM : InstrItinClass; +def IIC_BIT_SCAN_REG : InstrItinClass; +def IIC_MOVS : InstrItinClass; +def IIC_STOS : InstrItinClass; +def IIC_SCAS : InstrItinClass; +def IIC_CMPS : InstrItinClass; +def IIC_MOV : InstrItinClass; +def IIC_MOV_MEM : InstrItinClass; +def IIC_AHF : InstrItinClass; +def IIC_BT_MI : InstrItinClass; +def IIC_BT_MR : InstrItinClass; +def IIC_BT_RI : InstrItinClass; +def IIC_BT_RR : InstrItinClass; +def IIC_BTX_MI : InstrItinClass; +def IIC_BTX_MR : InstrItinClass; +def IIC_BTX_RI : InstrItinClass; +def IIC_BTX_RR : InstrItinClass; +def IIC_XCHG_REG : InstrItinClass; +def IIC_XCHG_MEM : InstrItinClass; +def IIC_XADD_REG : InstrItinClass; +def IIC_XADD_MEM : InstrItinClass; +def IIC_CMPXCHG_MEM : InstrItinClass; +def IIC_CMPXCHG_REG : InstrItinClass; +def IIC_CMPXCHG_MEM8 : InstrItinClass; +def IIC_CMPXCHG_REG8 : InstrItinClass; +def IIC_CMPXCHG_8B : InstrItinClass; +def IIC_CMPXCHG_16B : InstrItinClass; +def IIC_LODS : InstrItinClass; +def IIC_OUTS : InstrItinClass; +def IIC_CLC : InstrItinClass; +def IIC_CLD : InstrItinClass; +def IIC_CLI : InstrItinClass; +def IIC_CMC : InstrItinClass; +def IIC_CLTS : InstrItinClass; +def IIC_STC : InstrItinClass; +def IIC_STI : InstrItinClass; +def IIC_STD : InstrItinClass; +def IIC_XLAT : InstrItinClass; +def IIC_AAA : InstrItinClass; +def IIC_AAD : InstrItinClass; +def IIC_AAM : InstrItinClass; +def IIC_AAS : InstrItinClass; +def IIC_DAA : InstrItinClass; +def IIC_DAS : InstrItinClass; +def IIC_BOUND : InstrItinClass; +def IIC_ARPL_REG : InstrItinClass; +def IIC_ARPL_MEM : InstrItinClass; +def IIC_MOVBE : InstrItinClass; +def IIC_AES : InstrItinClass; +def IIC_BLEND_MEM : InstrItinClass; +def IIC_BLEND_NOMEM : InstrItinClass; +def IIC_CBW : InstrItinClass; +def IIC_CRC32_REG : InstrItinClass; +def IIC_CRC32_MEM : InstrItinClass; +def IIC_SSE_DPPD_RR : InstrItinClass; +def IIC_SSE_DPPD_RM : InstrItinClass; +def IIC_SSE_DPPS_RR : InstrItinClass; +def IIC_SSE_DPPS_RM : InstrItinClass; +def IIC_MMX_EMMS : InstrItinClass; +def IIC_SSE_EXTRACTPS_RR : InstrItinClass; +def IIC_SSE_EXTRACTPS_RM : InstrItinClass; +def IIC_SSE_INSERTPS_RR : InstrItinClass; +def IIC_SSE_INSERTPS_RM : InstrItinClass; +def IIC_SSE_MPSADBW_RR : InstrItinClass; +def IIC_SSE_MPSADBW_RM : InstrItinClass; +def IIC_SSE_PMULLD_RR : InstrItinClass; +def IIC_SSE_PMULLD_RM : InstrItinClass; +def IIC_SSE_ROUNDPS_REG : InstrItinClass; +def IIC_SSE_ROUNDPS_MEM : InstrItinClass; +def IIC_SSE_ROUNDPD_REG : InstrItinClass; +def IIC_SSE_ROUNDPD_MEM : InstrItinClass; +def IIC_SSE_POPCNT_RR : InstrItinClass; +def IIC_SSE_POPCNT_RM : InstrItinClass; +def IIC_SSE_PCLMULQDQ_RR : InstrItinClass; +def IIC_SSE_PCLMULQDQ_RM : InstrItinClass; + +def IIC_NOP : InstrItinClass; + 
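+// Illustrative note: an InstrItinClass is only a name. An in-order model binds
+// each class to functional-unit stages through a ProcessorItineraries, e.g.
+// the Atom model below contains entries such as
+//   InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+// while the out-of-order SchedMachineModels rely on the SchedWrite resources
+// defined above instead of itineraries.
+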
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+// IssueWidth is analogous to the number of decode units. Core and its
+// descendants, including Nehalem and SandyBridge, have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are buffered
+// so adjacent micro-ops don't directly compete.
+//
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a reasonably arbitrary
+// number of in-flight instructions.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericModel contains no instruction itineraries.
+def GenericModel : SchedMachineModel {
+  let IssueWidth = 4;
+  let MicroOpBufferSize = 32;
+  let LoadLatency = 4;
+  let HighLatency = 10;
+  let PostRAScheduler = 0;
+}
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
+include "X86ScheduleSLM.td"
+include "X86ScheduleBtVer2.td"
+
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 0000000..4c559c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,549 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom
+// in-order (Saltwell-32nm/Bonnell-45nm) processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+// Functional Units +// Port 0 +def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store + // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide +def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA + // SIMD/FP: SIMD ALU, FP Adder + +def AtomItineraries : ProcessorItineraries< + [ Port0, Port1 ], + [], [ + // P0 only + // InstrItinData<class, [InstrStage<N, [P0]>] >, + // P0 or P1 + // InstrItinData<class, [InstrStage<N, [P0, P1]>] >, + // P0 and P1 + // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >, + // + // Default is 1 cycle, port0 or port1 + InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >, + // mul + InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >, + // imul by al, ax, eax, rax + InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >, + // imul reg by reg|mem + InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >, + // imul reg = reg/mem * imm + InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >, + // idiv + InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >, + // div + InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >, + InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >, + // neg/not/inc/dec + InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >, + // add/sub/and/or/xor/cmp/test + InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >, + // adc/sbc + InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >, + // shift/rotate + 
InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >, + // shift double + InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >, + // cmov + InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >, + // set + InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >, + // jcc + InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >, + // jcxz/jecxz/jrcxz + InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >, + // jmp rel + InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >, + // jmp indirect + InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >, + // jmp far + InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >, + // loop/loope/loopne + InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >, + // call - all but reg/imm + InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >, + InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >, + InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >, + //ret + InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >, + InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >, + //sign extension movs + InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >, + InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >, + //zero extension movs + InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >, + InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >, + + // SSE binary operations + // arithmetic fp scalar + InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + 
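Each entry in this table (which continues below) binds an itinerary class to a cycle count and the ports that may execute it; entries with a zero-cycle first stage, like IIC_SSE_ALU_F32S_RM just above, reserve both ports at once. The following standalone sketch is a toy in-order port-assignment model, not the LLVM itinerary machinery: the greedy policy and all names other than the itinerary classes are invented for the example, and operand dependences, decode restrictions, and the 2-wide issue limit are ignored.

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

enum class PortUse { P0, P1, Either, Both };

struct Entry {
  std::string Name;
  unsigned Cycles;
  PortUse Use;
};

int main() {
  // Hand-picked subset; cycle counts mirror the entries in the table above.
  std::vector<Entry> Program = {
    {"IIC_ALU_NONMEM",      1, PortUse::Either}, // 1 cycle, Port0 or Port1
    {"IIC_LEA",             1, PortUse::P1},     // 1 cycle, Port1 only
    {"IIC_ALU_MEM",         1, PortUse::P0},     // 1 cycle, Port0 only
    {"IIC_SSE_ALU_F32S_RM", 5, PortUse::Both},   // 5 cycles on both ports
    {"IIC_ALU_NONMEM",      1, PortUse::Either},
  };

  unsigned Free0 = 0, Free1 = 0; // cycle at which each port becomes free
  for (const Entry &E : Program) {
    unsigned Start = 0;
    switch (E.Use) {
    case PortUse::P0:     Start = Free0; Free0 = Start + E.Cycles; break;
    case PortUse::P1:     Start = Free1; Free1 = Start + E.Cycles; break;
    case PortUse::Either: // greedy: take whichever port frees up first
      if (Free0 <= Free1) { Start = Free0; Free0 = Start + E.Cycles; }
      else                { Start = Free1; Free1 = Start + E.Cycles; }
      break;
    case PortUse::Both:   // zero-cycle first stage: both ports occupied
      Start = std::max(Free0, Free1);
      Free0 = Free1 = Start + E.Cycles;
      break;
    }
    std::printf("%-20s issues at cycle %u\n", E.Name.c_str(), Start);
  }
  return 0;
}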
InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >, + + // arithmetic fp parallel + InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >, + + // bitwise parallel + InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >, + + // arithmetic int parallel + InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >, + + // multiply int parallel + InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >, + + // shift parallel + InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSD_RR, 
[InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >, + + InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, 
[Port0, Port1]>] >, + + // conversions + // to/from PD ... + InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >, + // to/from PS except to/from PD and PS2PI + InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >, + + // MMX MOVs + InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >, + // other MMX + InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >, + // conversions + // from/to PD + InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >, + // from/to PI + InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>]>, + + InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >, + + InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >, 
+  InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+  InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
+  InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+  // System instructions
+  InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
+  InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+  InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
+  InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_IN_RR,
[InstrStage<94, [Port0, Port1]>] >, + InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >, + InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >, + // worst case for mov REG_CRx + InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >, + // LAR + InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >, + // LSL + InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >, + + InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >, + // push control register, segment registers + InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >, + // pop control register, segment registers + InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >, + InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >, + // VERR, VERW + InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >, + // WRMSR, RDMSR + InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >, + InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >, + InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >, + // SMSW, LMSW + InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >, + InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >, + + InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >, + InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >, + + InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >, + + InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_STOS, 
[InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >, + InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >, + InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >, + InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >, + InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >, + InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >, + InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >, + InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >, + + InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] > + ]>; + +// Atom machine model. +def AtomModel : SchedMachineModel { + let IssueWidth = 2; // Allows 2 instructions per scheduling group. + let MicroOpBufferSize = 0; // In-order execution, always hide latency. + let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. + let HighLatency = 30;// Expected, may be overriden by OperandCycles. + + // On the Atom, the throughput for taken branches is 2 cycles. For small + // simple loops, expand by a small factor to hide the backedge cost. 
+ let LoopMicroOpBufferSize = 10; + let PostRAScheduler = 1; + + let Itineraries = AtomItineraries; +} diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td new file mode 100644 index 0000000..ce1ece3 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -0,0 +1,341 @@ +//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for AMD btver2 (Jaguar) to support +// instruction scheduling and other instruction cost heuristics. Based off AMD Software +// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix. +// +//===----------------------------------------------------------------------===// + +def BtVer2Model : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and btver2 can + // decode 2 instructions per cycle. + let IssueWidth = 2; + let MicroOpBufferSize = 64; // Retire Control Unit + let LoadLatency = 5; // FPU latency (worse case cf Integer 3 cycle latency) + let HighLatency = 25; + let MispredictPenalty = 14; // Minimum branch misdirection penalty + let PostRAScheduler = 1; + + // FIXME: SSE4/AVX is unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = BtVer2Model in { + +// Jaguar can issue up to 6 micro-ops in one cycle +def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam) +def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV +def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU +def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA) +def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA +def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM + +// Any pipe - FIXME we need this until we can discriminate between int/fpu load/store/moves properly +def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>; + +// Integer Pipe Scheduler +def JALU01 : ProcResGroup<[JALU0, JALU1]> { + let BufferSize=20; +} + +// AGU Pipe Scheduler +def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> { + let BufferSize=12; +} + +// Fpu Pipe Scheduler +def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> { + let BufferSize=18; +} + +def JDiv : ProcResource<1>; // integer division +def JMul : ProcResource<1>; // integer multiplication +def JVALU0 : ProcResource<1>; // vector integer +def JVALU1 : ProcResource<1>; // vector integer +def JVIMUL : ProcResource<1>; // vector integer multiplication +def JSTC : ProcResource<1>; // vector store/convert +def JFPM : ProcResource<1>; // FP multiplication +def JFPA : ProcResource<1>; // FP addition + +// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 3>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when dispatched by the schedulers. +// This multiclass defines the resource usage for variants with and without +// folded loads. 
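The multiclass defined just below pairs each register-form SchedWrite with its load-folded variant, adding the load latency on top of the execution latency (3 cycles on the integer side, 5 on the FP side for this model). A standalone sketch of that pairing (hypothetical names; not TableGen or the LLVM API), using the WriteALU and WriteFAdd latencies that appear later in this file:

// Register vs. folded-load latency pairing, as encoded by the multiclasses
// below for this model. Not the LLVM API; names are hypothetical.
#include <iostream>

struct WritePair {
  unsigned RegLatency; // latency of the register-only form
  unsigned LoadAdjust; // extra cycles when a load is folded into the op
};

unsigned latency(const WritePair &W, bool Folded) {
  return Folded ? W.RegLatency + W.LoadAdjust : W.RegLatency;
}

int main() {
  WritePair IntALU{1, 3}; // e.g. JWriteResIntPair<WriteALU, JALU01, 1>
  WritePair FpAdd {3, 5}; // e.g. JWriteResFpuPair<WriteFAdd, JFPU0, 3>

  std::cout << "integer ALU, reg form   : " << latency(IntALU, false) << " cycles\n"; // 1
  std::cout << "integer ALU, folded load: " << latency(IntALU, true)  << " cycles\n"; // 4
  std::cout << "FP add, reg form        : " << latency(FpAdd, false)  << " cycles\n"; // 3
  std::cout << "FP add, folded load     : " << latency(FpAdd, true)   << " cycles\n"; // 8
  return 0;
}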
+multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> { + let Latency = !add(Lat, 3); + } +} + +multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> { + let Latency = !add(Lat, 5); + } +} + +// A folded store needs a cycle on the SAGU for the store data. +def : WriteRes<WriteRMW, [JSAGU]>; + +//////////////////////////////////////////////////////////////////////////////// +// Arithmetic. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteALU, JALU01, 1>; +defm : JWriteResIntPair<WriteIMul, JALU1, 3>; + +def : WriteRes<WriteIMulH, [JALU1]> { + let Latency = 6; + let ResourceCycles = [4]; +} + +// FIXME 8/16 bit divisions +def : WriteRes<WriteIDiv, [JALU1, JDiv]> { + let Latency = 25; + let ResourceCycles = [1, 25]; +} +def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> { + let Latency = 41; + let ResourceCycles = [1, 1, 25]; +} + +// This is for simple LEAs with one or two input operands. +// FIXME: SAGU 3-operand LEA +def : WriteRes<WriteLEA, [JALU01]>; + +//////////////////////////////////////////////////////////////////////////////// +// Integer shifts and rotates. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteShift, JALU01, 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Loads, stores, and moves, not folded with other operations. +// FIXME: Split x86 and SSE load/store/moves +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; } +def : WriteRes<WriteStore, [JSAGU]>; +def : WriteRes<WriteMove, [JAny]>; + +//////////////////////////////////////////////////////////////////////////////// +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteZero, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteJump, JALU01, 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Floating point. This covers both scalar and vector operations. +// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions? 
+// FIXME: Double precision latencies +// FIXME: SS vs PS latencies +// FIXME: ymm latencies +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>; +defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>; +defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>; +defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>; + +def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> { + let Latency = 21; + let ResourceCycles = [1, 1, 21]; +} +def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> { + let Latency = 26; + let ResourceCycles = [1, 1, 21]; +} + +def : WriteRes<WriteFDiv, [JFPU1, JLAGU, JFPM]> { + let Latency = 19; + let ResourceCycles = [1, 1, 19]; +} +def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> { + let Latency = 24; + let ResourceCycles = [1, 1, 19]; +} + +// FIXME: integer pipes +defm : JWriteResFpuPair<WriteCvtF2I, JFPU1, 3>; // Float -> Integer. +defm : JWriteResFpuPair<WriteCvtI2F, JFPU1, 3>; // Integer -> Float. +defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conversion. + +def : WriteRes<WriteFVarBlend, [JFPU01]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} + +// Vector integer operations. +defm : JWriteResFpuPair<WriteVecALU, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecShift, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecIMul, JFPU0, 2>; +defm : JWriteResFpuPair<WriteShuffle, JFPU01, 1>; +defm : JWriteResFpuPair<WriteBlend, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecLogic, JFPU01, 1>; +defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>; + +def : WriteRes<WriteVarBlend, [JFPU01]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} + +// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2? +def : WriteRes<WriteVarVecShift, [JFPU01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> { + let Latency = 6; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteMPSAD, [JFPU0]> { + let Latency = 3; + let ResourceCycles = [2]; +} +def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> { + let Latency = 8; + let ResourceCycles = [1, 2]; +} + +//////////////////////////////////////////////////////////////////////////////// +// String instructions. 
+// Packed Compare Implicit Length Strings, Return Mask +// FIXME: approximate latencies + pipe dependencies +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WritePCmpIStrM, [JFPU01]> { + let Latency = 7; + let ResourceCycles = [2]; +} +def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> { + let Latency = 12; + let ResourceCycles = [1, 2]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [JFPU01]> { + let Latency = 13; + let ResourceCycles = [5]; +} +def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> { + let Latency = 18; + let ResourceCycles = [1, 5]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [JFPU01]> { + let Latency = 6; + let ResourceCycles = [2]; +} +def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> { + let Latency = 11; + let ResourceCycles = [1, 2]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [JFPU01]> { + let Latency = 13; + let ResourceCycles = [5]; +} +def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> { + let Latency = 18; + let ResourceCycles = [1, 5]; +} + +//////////////////////////////////////////////////////////////////////////////// +// AES Instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> { + let Latency = 3; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> { + let Latency = 8; + let ResourceCycles = [1, 1, 1]; +} + +def : WriteRes<WriteAESIMC, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteAESKeyGen, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Carry-less multiplication instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteCLMul, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +// FIXME: pipe for system/microcode? +def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; } +def : WriteRes<WriteFence, [JSAGU]>; +def : WriteRes<WriteNop, []>; +} // SchedModel + diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td new file mode 100644 index 0000000..f95d4fa --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -0,0 +1,233 @@ +//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Intel Silvermont to support +// instruction scheduling and other instruction cost heuristics. 
+// +//===----------------------------------------------------------------------===// + +def SLMModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SLM can decode 2 + // instructions per cycle. + let IssueWidth = 2; + let MicroOpBufferSize = 32; // Based on the reorder buffer. + let LoadLatency = 3; + let MispredictPenalty = 10; + let PostRAScheduler = 1; + + // For small loops, expand by a small factor to hide the backedge cost. + let LoopMicroOpBufferSize = 10; + + // FIXME: SSE4 is unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SLMModel in { + +// Silvermont has 5 reservation stations for micro-ops + +def IEC_RSV0 : ProcResource<1>; +def IEC_RSV1 : ProcResource<1>; +def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; } +def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; } +def MEC_RSV : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def IEC_RSV01 : ProcResGroup<[IEC_RSV0, IEC_RSV1]>; +def FPC_RSV01 : ProcResGroup<[FPC_RSV0, FPC_RSV1]>; + +def SMDivider : ProcResource<1>; +def SMFPMultiplier : ProcResource<1>; +def SMFPDivider : ProcResource<1>; + +// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 3>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> { + let Latency = !add(Lat, 3); + } +} + +// A folded store needs a cycle on MEC_RSV for the store data, but it does not +// need an extra port cycle to recompute the address. +def : WriteRes<WriteRMW, [MEC_RSV]>; + +def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>; +def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteMove, [IEC_RSV01]>; +def : WriteRes<WriteZero, []>; + +defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>; +defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>; +defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>; +defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [IEC_RSV1]>; + +// This is quite rough, latency depends on the dividend. +def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> { + let Latency = 25; + let ResourceCycles = [1, 25]; +} +def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 25]; +} + +// Scalar and vector floating point. 
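Both the integer divider modeled just above (WriteIDiv occupying SMDivider for 25 cycles via ResourceCycles) and the floating-point multiplier and divider defined below follow the same pattern: a separate, effectively non-pipelined ProcResource that stays busy for the listed ResourceCycles. A standalone sketch of what that occupancy means for back-to-back throughput (hypothetical names; not the LLVM machine model):

// Occupancy of a non-pipelined unit: a new divide cannot start until the
// divider frees up, so back-to-back divides are spaced by the ResourceCycles
// value even though unrelated micro-ops can still issue in between.
#include <iostream>

int main() {
  const unsigned DivResourceCycles = 25; // divider occupancy per divide
  unsigned DividerFree = 0;              // cycle at which the divider frees up

  for (int i = 0; i < 3; ++i) {
    unsigned Start = DividerFree;        // wait for the divider to free up
    DividerFree = Start + DivResourceCycles;
    std::cout << "div #" << i << " starts at cycle " << Start << "\n";
  }
  // Prints 0, 25, 50: throughput is limited by the occupancy, not just by
  // the latency of each individual divide.
  return 0;
}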
+defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>; +defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>; +defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>; +defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>; +defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>; +defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>; +defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>; +defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>; + +// This is quite rough, latency depends on precision +def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> { + let Latency = 5; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> { + let Latency = 8; + let ResourceCycles = [1, 1, 2]; +} + +def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> { + let Latency = 34; + let ResourceCycles = [1, 34]; +} +def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> { + let Latency = 37; + let ResourceCycles = [1, 1, 34]; +} + +// Vector integer operations. +defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>; +defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>; +defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>; +defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>; + +// String instructions. +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> { + let Latency = 13; + let ResourceCycles = [13]; +} +def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 13; + let ResourceCycles = [13, 1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 17; + let ResourceCycles = [17, 1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> { + let Latency = 17; + let ResourceCycles = [17, 1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> { + let Latency = 21; + let ResourceCycles = [21]; +} +def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> { + let Latency = 21; + let ResourceCycles = [21, 1]; +} + +// AES Instructions. +def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} + +def : WriteRes<WriteAESIMC, [FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} + +def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} + +// Carry-less multiplication instructions. 
+def : WriteRes<WriteCLMul, [FPC_RSV0]> { + let Latency = 10; + let ResourceCycles = [10]; +} +def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 10; + let ResourceCycles = [10, 1]; +} + + +def : WriteRes<WriteSystem, [FPC_RSV0]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; } +def : WriteRes<WriteFence, [MEC_RSV]>; +def : WriteRes<WriteNop, []>; + +// AVX is not supported on that architecture, but we should define the basic +// scheduling resources anyway. +def : WriteRes<WriteIMulH, [FPC_RSV0]>; +defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>; +} // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp new file mode 100644 index 0000000..b1a0161 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -0,0 +1,284 @@ +//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86SelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" +#include "X86ISelLowering.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86SelectionDAGInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/Target/TargetLowering.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-selectiondag-info" + +bool X86SelectionDAGInfo::isBaseRegConflictPossible( + SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const { + // We cannot use TRI->hasBasePointer() until *after* we select all basic + // blocks. Legalization may introduce new stack temporaries with large + // alignment requirements. Fall back to generic code if there are any + // dynamic stack adjustments (hopefully rare) and the base pointer would + // conflict if we had to use it. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + if (!MFI->hasVarSizedObjects() && !MFI->hasOpaqueSPAdjustment()) + return false; + + const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); + unsigned BaseReg = TRI->getBaseRegister(); + for (unsigned R : ClobberSet) + if (BaseReg == R) + return true; + return false; +} + +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<X86Subtarget>(); + +#ifndef NDEBUG + // If the base register might conflict with our physical registers, bail out. + const unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI, + X86::ECX, X86::EAX, X86::EDI}; + assert(!isBaseRegConflictPossible(DAG, ClobberSet)); +#endif + + // If to a segment-relative address space, use the default lowering. 
+ if (DstPtrInfo.getAddrSpace() >= 256) + return SDValue(); + + // If not DWORD aligned or size is more than the threshold, call the library. + // The libc version is likely to be faster for these cases. It can use the + // address value and run time information about the CPU. + if ((Align & 3) != 0 || !ConstantSize || + ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) { + // Check to see if there is a specialized entry-point for memory zeroing. + ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); + + if (const char *bzeroEntry = V && + V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), + 0) + .setDiscardResult(); + + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; + } + + // Otherwise have the target-independent code call memset. + return SDValue(); + } + + uint64_t SizeVal = ConstantSize->getZExtValue(); + SDValue InFlag; + EVT AVT; + SDValue Count; + ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); + unsigned BytesLeft = 0; + bool TwoRepStos = false; + if (ValC) { + unsigned ValReg; + uint64_t Val = ValC->getZExtValue() & 255; + + // If the value is a constant, then we can potentially use larger sets. + switch (Align & 3) { + case 2: // WORD aligned + AVT = MVT::i16; + ValReg = X86::AX; + Val = (Val << 8) | Val; + break; + case 0: // DWORD aligned + AVT = MVT::i32; + ValReg = X86::EAX; + Val = (Val << 8) | Val; + Val = (Val << 16) | Val; + if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned + AVT = MVT::i64; + ValReg = X86::RAX; + Val = (Val << 32) | Val; + } + break; + default: // Byte aligned + AVT = MVT::i8; + ValReg = X86::AL; + Count = DAG.getIntPtrConstant(SizeVal, dl); + break; + } + + if (AVT.bitsGT(MVT::i8)) { + unsigned UBytes = AVT.getSizeInBits() / 8; + Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl); + BytesLeft = SizeVal % UBytes; + } + + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), + InFlag); + InFlag = Chain.getValue(1); + } else { + AVT = MVT::i8; + Count = DAG.getIntPtrConstant(SizeVal, dl); + Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); + InFlag = Chain.getValue(1); + } + + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); + + if (TwoRepStos) { + InFlag = Chain.getValue(1); + Count = Size; + EVT CVT = Count.getValueType(); + SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, + DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, + CVT)); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? 
X86::RCX : X86::ECX, + Left, InFlag); + InFlag = Chain.getValue(1); + Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); + } else if (BytesLeft) { + // Handle the last 1 - 7 bytes. + unsigned Offset = SizeVal - BytesLeft; + EVT AddrVT = Dst.getValueType(); + EVT SizeVT = Size.getValueType(); + + Chain = DAG.getMemset(Chain, dl, + DAG.getNode(ISD::ADD, dl, AddrVT, Dst, + DAG.getConstant(Offset, dl, AddrVT)), + Src, + DAG.getConstant(BytesLeft, dl, SizeVT), + Align, isVolatile, false, + DstPtrInfo.getWithOffset(Offset)); + } + + // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. + return Chain; +} + +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { + // This requires the copy size to be a constant, preferably + // within a subtarget-specific limit. + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<X86Subtarget>(); + if (!ConstantSize) + return SDValue(); + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) + return SDValue(); + + /// If not DWORD aligned, it is more efficient to call the library. However + /// if calling the library is not allowed (AlwaysInline), then soldier on as + /// the code generated here is better than the long load-store sequence we + /// would otherwise get. + if (!AlwaysInline && (Align & 3) != 0) + return SDValue(); + + // If to a segment-relative address space, use the default lowering. + if (DstPtrInfo.getAddrSpace() >= 256 || + SrcPtrInfo.getAddrSpace() >= 256) + return SDValue(); + + // If the base register might conflict with our physical registers, bail out. + const unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, + X86::ECX, X86::ESI, X86::EDI}; + if (isBaseRegConflictPossible(DAG, ClobberSet)) + return SDValue(); + + MVT AVT; + if (Align & 1) + AVT = MVT::i8; + else if (Align & 2) + AVT = MVT::i16; + else if (Align & 4) + // DWORD aligned + AVT = MVT::i32; + else + // QWORD aligned + AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; + + unsigned UBytes = AVT.getSizeInBits() / 8; + unsigned CountVal = SizeVal / UBytes; + SDValue Count = DAG.getIntPtrConstant(CountVal, dl); + unsigned BytesLeft = SizeVal % UBytes; + + SDValue InFlag; + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI, + Src, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; + SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops); + + SmallVector<SDValue, 4> Results; + Results.push_back(RepMovs); + if (BytesLeft) { + // Handle the last 1 - 7 bytes. 
+ unsigned Offset = SizeVal - BytesLeft; + EVT DstVT = Dst.getValueType(); + EVT SrcVT = Src.getValueType(); + EVT SizeVT = Size.getValueType(); + Results.push_back(DAG.getMemcpy(Chain, dl, + DAG.getNode(ISD::ADD, dl, DstVT, Dst, + DAG.getConstant(Offset, dl, + DstVT)), + DAG.getNode(ISD::ADD, dl, SrcVT, Src, + DAG.getConstant(Offset, dl, + SrcVT)), + DAG.getConstant(BytesLeft, dl, SizeVT), + Align, isVolatile, AlwaysInline, false, + DstPtrInfo.getWithOffset(Offset), + SrcPtrInfo.getWithOffset(Offset))); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); +} diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h new file mode 100644 index 0000000..961bd8c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -0,0 +1,52 @@ +//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H +#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class X86TargetLowering; +class X86TargetMachine; +class X86Subtarget; + +class X86SelectionDAGInfo : public TargetSelectionDAGInfo { + /// Returns true if it is possible for the base register to conflict with the + /// given set of clobbers for a memory intrinsic. + bool isBaseRegConflictPossible(SelectionDAG &DAG, + ArrayRef<unsigned> ClobberSet) const; + +public: + explicit X86SelectionDAGInfo() = default; + + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, + MachinePointerInfo DstPtrInfo) const override; + + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const override; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp new file mode 100644 index 0000000..ef16c5b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -0,0 +1,190 @@ +//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. 
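Before the shuffle-decode helpers that the header above introduces, a brief recap of the two memory-intrinsic lowerings in X86SelectionDAGInfo.cpp above: both EmitTargetCodeForMemset and EmitTargetCodeForMemcpy reduce to the same arithmetic, picking the widest store unit the alignment allows, emitting a rep-prefixed string instruction for Size / UnitBytes units, and cleaning up the Size % UnitBytes tail with a narrower follow-up operation. A standalone sketch of that arithmetic (hypothetical names; not the SelectionDAG API):

// Chunking arithmetic used by the memset/memcpy lowerings above: choose the
// widest unit the alignment allows, replicate the fill byte across it, and
// split Size into a rep-count plus a tail. Hypothetical names only.
#include <cstdint>
#include <cstdio>

struct RepPlan {
  unsigned UnitBytes;   // 1, 2, 4 or 8, driven by alignment
  uint64_t RepCount;    // units covered by the rep-prefixed instruction
  uint64_t BytesLeft;   // tail handled by a follow-up memset/memcpy
  uint64_t FillPattern; // fill byte replicated across the unit (memset only)
};

RepPlan planRepStos(uint64_t Size, unsigned Align, uint8_t Byte, bool Is64Bit) {
  RepPlan P;
  if (Align % 8 == 0 && Is64Bit) P.UnitBytes = 8;
  else if (Align % 4 == 0)       P.UnitBytes = 4;
  else if (Align % 2 == 0)       P.UnitBytes = 2;
  else                           P.UnitBytes = 1;

  // Replicate the byte across the unit, as the lowering does with
  // Val = (Val << 8) | Val, (Val << 16) | Val, and so on.
  P.FillPattern = Byte;
  for (unsigned Shift = 8; Shift < P.UnitBytes * 8; Shift *= 2)
    P.FillPattern |= P.FillPattern << Shift;

  P.RepCount = Size / P.UnitBytes;
  P.BytesLeft = Size % P.UnitBytes;
  return P;
}

int main() {
  RepPlan P = planRepStos(/*Size=*/1000, /*Align=*/8, /*Byte=*/0xAB, /*Is64Bit=*/true);
  std::printf("unit=%u count=%llu tail=%llu pattern=%#llx\n",
              P.UnitBytes, (unsigned long long)P.RepCount,
              (unsigned long long)P.BytesLeft,
              (unsigned long long)P.FillPattern);
  // unit=8 count=125 tail=0 pattern=0xabababababababab
  return 0;
}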
+// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecodeConstantPool.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/IR/Constants.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + +#ifndef NDEBUG + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512); +#endif + + // This is a straightforward byte vector. + if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { + int NumElements = MaskTy->getVectorNumElements(); + ShuffleMask.reserve(NumElements); + + for (int i = 0; i < NumElements; ++i) { + // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte + // lane of the vector we're inside. + int Base = i & ~0xf; + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); + } + } + } + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + + if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + return; + + // Only support vector types. + if (!MaskTy->isVectorTy()) + return; + + // Make sure its an integer type. + Type *VecEltTy = MaskTy->getVectorElementType(); + if (!VecEltTy->isIntegerTy()) + return; + + // Support any element type from byte up to element size. + // This is necesary primarily because 64-bit elements get split to 32-bit + // in the constant pool on 32-bit target. 
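+  // Illustrative sketch (editor's addition, not in the original source): a
+  // v4f64 VPERMILPD mask built as <4 x i64> <i64 1, i64 0, i64 1, i64 0> may
+  // be stored in the constant pool of a 32-bit target as an <8 x i32>
+  // constant. With ElSize == 64 and EltTySize == 32 below, Factor becomes 2,
+  // so only every second constant-pool element (the low half of each
+  // original i64 on this little-endian target) is read when rebuilding the
+  // shuffle mask.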
+ unsigned EltTySize = VecEltTy->getIntegerBitWidth(); + if (EltTySize < 8 || EltTySize > ElSize) + return; + + unsigned NumElements = MaskTySize / ElSize; + assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + "Unexpected number of vector elements."); + ShuffleMask.reserve(NumElements); + unsigned NumElementsPerLane = 128 / ElSize; + unsigned Factor = ElSize / EltTySize; + + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i * Factor); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + int Index = i & ~(NumElementsPerLane - 1); + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + if (ElSize == 64) + Index += (Element >> 1) & 0x1; + else + Index += Element & 0x3; + ShuffleMask.push_back(Index); + } + + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + if (MaskTy->isVectorTy()) { + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements) - 1; + ShuffleMask.push_back(Element); + } + } + } + return; + } + // Scalar value; just broadcast it + if (!isa<ConstantInt>(C)) + return; + uint64_t Element = cast<ConstantInt>(C)->getZExtValue(); + int NumElements = VT.getVectorNumElements(); + Element &= (1 << NumElements) - 1; + for (int i = 0; i < NumElements; ++i) + ShuffleMask.push_back(Element); +} + +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements*2) - 1; + ShuffleMask.push_back(Element); + } + } + } +} +} // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h new file mode 100644 index 0000000..bcf4632 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -0,0 +1,45 @@ +//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H +#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H + +#include "llvm/ADT/SmallVector.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +class Constant; +class MVT; + +/// \brief Decode a PSHUFB mask from an IR-level vector constant. +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +} // llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp new file mode 100644 index 0000000..8ef08c9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -0,0 +1,343 @@ +//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86 specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#include "X86Subtarget.h" +#include "X86InstrInfo.h" +#include "X86TargetMachine.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" + +#if defined(_MSC_VER) +#include <intrin.h> +#endif + +using namespace llvm; + +#define DEBUG_TYPE "subtarget" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "X86GenSubtargetInfo.inc" + +// Temporary option to control early if-conversion for x86 while adding machine +// models. +static cl::opt<bool> +X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, + cl::desc("Enable early if-conversion on X86")); + + +/// Classify a blockaddress reference for the current subtarget according to how +/// we should reference it in a non-pcrel context. +unsigned char X86Subtarget::ClassifyBlockAddressReference() const { + if (isPICStyleGOT()) // 32-bit ELF targets. + return X86II::MO_GOTOFF; + + if (isPICStyleStubPIC()) // Darwin/32 in PIC mode. + return X86II::MO_PIC_BASE_OFFSET; + + // Direct static reference to label. + return X86II::MO_NO_FLAG; +} + +/// Classify a global variable reference for the current subtarget according to +/// how we should reference it in a non-pcrel context. 
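+//
+// Illustrative examples (editor's sketch, not part of the original source),
+// derived from the cases handled below: an externally visible,
+// default-visibility global referenced from x86-64 RIP-relative PIC ELF code
+// (non-large code model) is classified as MO_GOTPCREL, i.e. its address is
+// loaded from the GOT, while a global with a strong definition in the current
+// translation unit under Darwin/32 PIC is classified as MO_PIC_BASE_OFFSET
+// and addressed directly relative to the PIC base.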
+unsigned char X86Subtarget:: +ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { + // DLLImport only exists on windows, it is implemented as a load from a + // DLLIMPORT stub. + if (GV->hasDLLImportStorageClass()) + return X86II::MO_DLLIMPORT; + + bool isDef = GV->isStrongDefinitionForLinker(); + + // X86-64 in PIC mode. + if (isPICStyleRIPRel()) { + // Large model never uses stubs. + if (TM.getCodeModel() == CodeModel::Large) + return X86II::MO_NO_FLAG; + + if (isTargetDarwin()) { + // If symbol visibility is hidden, the extra load is not needed if + // target is x86-64 or the symbol is definitely defined in the current + // translation unit. + if (GV->hasDefaultVisibility() && !isDef) + return X86II::MO_GOTPCREL; + } else if (!isTargetWin64()) { + assert(isTargetELF() && "Unknown rip-relative target"); + + // Extra load is needed for all externally visible. + if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility()) + return X86II::MO_GOTPCREL; + } + + return X86II::MO_NO_FLAG; + } + + if (isPICStyleGOT()) { // 32-bit ELF targets. + // Extra load is needed for all externally visible. + if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) + return X86II::MO_GOTOFF; + return X86II::MO_GOT; + } + + if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode. + // Determine whether we have a stub reference and/or whether the reference + // is relative to the PIC base or not. + + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (isDef) + return X86II::MO_PIC_BASE_OFFSET; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return X86II::MO_DARWIN_NONLAZY_PIC_BASE; + + // If symbol visibility is hidden, we have a stub for common symbol + // references and external declarations. + if (GV->isDeclarationForLinker() || GV->hasCommonLinkage()) { + // Hidden $non_lazy_ptr reference. + return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE; + } + + // Otherwise, no stub. + return X86II::MO_PIC_BASE_OFFSET; + } + + if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode. + // Determine whether we have a stub reference. + + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (isDef) + return X86II::MO_NO_FLAG; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return X86II::MO_DARWIN_NONLAZY; + + // Otherwise, no stub. + return X86II::MO_NO_FLAG; + } + + // Direct static reference to global. + return X86II::MO_NO_FLAG; +} + + +/// This function returns the name of a function which has an interface like +/// the non-standard bzero function, if such a function exists on the +/// current subtarget and it is considered preferable over memset with zero +/// passed as the second argument. Otherwise it returns null. +const char *X86Subtarget::getBZeroEntry() const { + // Darwin 10 has a __bzero entry point for this purpose. 
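+  // Editor's note (illustrative, not in the original source): per the
+  // documentation above, callers can use the returned name to rewrite a
+  // zeroing memset, e.g. memset(p, 0, n) may be emitted as __bzero(p, n)
+  // when this returns "__bzero" (macOS 10.6 or later per the check below).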
+ if (getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 6)) + return "__bzero"; + + return nullptr; +} + +bool X86Subtarget::hasSinCos() const { + return getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 9) && + is64Bit(); +} + +/// Return true if the subtarget allows calls to immediate address. +bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { + // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 + // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does, + // the following check for Win32 should be removed. + if (In64BitMode || isTargetWin32()) + return false; + return isTargetELF() || TM.getRelocationModel() == Reloc::Static; +} + +void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "generic"; + + // Make sure 64-bit features are available in 64-bit mode. (But make sure + // SSE2 can be turned off explicitly.) + std::string FullFS = FS; + if (In64BitMode) { + if (!FullFS.empty()) + FullFS = "+64bit,+sse2," + FullFS; + else + FullFS = "+64bit,+sse2"; + } + + // LAHF/SAHF are always supported in non-64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } + + + // Parse features string and set the CPU. + ParseSubtargetFeatures(CPUName, FullFS); + + // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of + // 16-bytes and under that are reasonably fast. These features were + // introduced with Intel's Nehalem/Silvermont and AMD's Family10h + // micro-architectures respectively. + if (hasSSE42() || hasSSE4A()) + IsUAMem16Slow = false; + + InstrItins = getInstrItineraryForCPU(CPUName); + + // It's important to keep the MCSubtargetInfo feature bits in sync with + // target data structure which is shared with MC code emitter, etc. + if (In64BitMode) + ToggleFeature(X86::Mode64Bit); + else if (In32BitMode) + ToggleFeature(X86::Mode32Bit); + else if (In16BitMode) + ToggleFeature(X86::Mode16Bit); + else + llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!"); + + DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel + << ", 3DNowLevel " << X863DNowLevel + << ", 64bit " << HasX86_64 << "\n"); + assert((!In64BitMode || HasX86_64) && + "64-bit code requested on a subtarget that doesn't support it!"); + + // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both + // 32 and 64 bit) and for all 64-bit targets. 
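+  // For example (editor's sketch, not in the original source): with no
+  // stack-alignment override, i386-unknown-linux-gnu and any 64-bit target
+  // end up with stackAlignment == 16 via the branch below, while a plain
+  // 32-bit Windows target keeps the default of 4 set in
+  // initializeEnvironment().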
+ if (StackAlignOverride) + stackAlignment = StackAlignOverride; + else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || + In64BitMode) + stackAlignment = 16; +} + +void X86Subtarget::initializeEnvironment() { + X86SSELevel = NoSSE; + X863DNowLevel = NoThreeDNow; + HasCMov = false; + HasX86_64 = false; + HasPOPCNT = false; + HasSSE4A = false; + HasAES = false; + HasFXSR = false; + HasXSAVE = false; + HasXSAVEOPT = false; + HasXSAVEC = false; + HasXSAVES = false; + HasPCLMUL = false; + HasFMA = false; + HasFMA4 = false; + HasXOP = false; + HasTBM = false; + HasMOVBE = false; + HasRDRAND = false; + HasF16C = false; + HasFSGSBase = false; + HasLZCNT = false; + HasBMI = false; + HasBMI2 = false; + HasRTM = false; + HasHLE = false; + HasERI = false; + HasCDI = false; + HasPFI = false; + HasDQI = false; + HasBWI = false; + HasVLX = false; + HasADX = false; + HasPKU = false; + HasSHA = false; + HasPRFCHW = false; + HasRDSEED = false; + HasLAHFSAHF = false; + HasMPX = false; + IsBTMemSlow = false; + IsSHLDSlow = false; + IsUAMem16Slow = false; + IsUAMem32Slow = false; + HasSSEUnalignedMem = false; + HasCmpxchg16b = false; + UseLeaForSP = false; + HasSlowDivide32 = false; + HasSlowDivide64 = false; + PadShortFunctions = false; + CallRegIndirect = false; + LEAUsesAG = false; + SlowLEA = false; + SlowIncDec = false; + stackAlignment = 4; + // FIXME: this is a known good value for Yonah. How about others? + MaxInlineSizeThreshold = 128; + UseSoftFloat = false; +} + +X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + initializeEnvironment(); + initSubtargetFeatures(CPU, FS); + return *this; +} + +X86Subtarget::X86Subtarget(const Triple &TT, const std::string &CPU, + const std::string &FS, const X86TargetMachine &TM, + unsigned StackAlignOverride) + : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), + PICStyle(PICStyles::None), TargetTriple(TT), + StackAlignOverride(StackAlignOverride), + In64BitMode(TargetTriple.getArch() == Triple::x86_64), + In32BitMode(TargetTriple.getArch() == Triple::x86 && + TargetTriple.getEnvironment() != Triple::CODE16), + In16BitMode(TargetTriple.getArch() == Triple::x86 && + TargetTriple.getEnvironment() == Triple::CODE16), + TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { + // Determine the PICStyle based on the target selected. + if (TM.getRelocationModel() == Reloc::Static) { + // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. + setPICStyle(PICStyles::None); + } else if (is64Bit()) { + // PIC in 64 bit mode is always rip-rel. 
+ setPICStyle(PICStyles::RIPRel); + } else if (isTargetCOFF()) { + setPICStyle(PICStyles::None); + } else if (isTargetDarwin()) { + if (TM.getRelocationModel() == Reloc::PIC_) + setPICStyle(PICStyles::StubPIC); + else { + assert(TM.getRelocationModel() == Reloc::DynamicNoPIC); + setPICStyle(PICStyles::StubDynamicNoPIC); + } + } else if (isTargetELF()) { + setPICStyle(PICStyles::GOT); + } +} + +bool X86Subtarget::enableEarlyIfConversion() const { + return hasCMov() && X86EarlyIfConv; +} + diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h new file mode 100644 index 0000000..13d1026 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -0,0 +1,546 @@ +//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the X86 specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H +#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H + +#include "X86FrameLowering.h" +#include "X86ISelLowering.h" +#include "X86InstrInfo.h" +#include "X86SelectionDAGInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "X86GenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class StringRef; +class TargetMachine; + +/// The X86 backend supports a number of different styles of PIC. +/// +namespace PICStyles { +enum Style { + StubPIC, // Used on i386-darwin in -fPIC mode. + StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode. + GOT, // Used on many 32-bit unices in -fPIC mode. + RIPRel, // Used on X86-64 when not in -static mode. + None // Set when in -static mode (not PIC or DynamicNoPIC mode). +}; +} + +class X86Subtarget final : public X86GenSubtargetInfo { + +protected: + enum X86SSEEnum { + NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F + }; + + enum X863DNowEnum { + NoThreeDNow, MMX, ThreeDNow, ThreeDNowA + }; + + enum X86ProcFamilyEnum { + Others, IntelAtom, IntelSLM + }; + + /// X86 processor family: Intel Atom, and others + X86ProcFamilyEnum X86ProcFamily; + + /// Which PIC style to use + PICStyles::Style PICStyle; + + /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. + X86SSEEnum X86SSELevel; + + /// MMX, 3DNow, 3DNow Athlon, or none supported. + X863DNowEnum X863DNowLevel; + + /// True if this processor has conditional move instructions + /// (generally pentium pro+). + bool HasCMov; + + /// True if the processor supports X86-64 instructions. + bool HasX86_64; + + /// True if the processor supports POPCNT. + bool HasPOPCNT; + + /// True if the processor supports SSE4A instructions. 
+ bool HasSSE4A; + + /// Target has AES instructions + bool HasAES; + + /// Target has FXSAVE/FXRESTOR instructions + bool HasFXSR; + + /// Target has XSAVE instructions + bool HasXSAVE; + /// Target has XSAVEOPT instructions + bool HasXSAVEOPT; + /// Target has XSAVEC instructions + bool HasXSAVEC; + /// Target has XSAVES instructions + bool HasXSAVES; + + /// Target has carry-less multiplication + bool HasPCLMUL; + + /// Target has 3-operand fused multiply-add + bool HasFMA; + + /// Target has 4-operand fused multiply-add + bool HasFMA4; + + /// Target has XOP instructions + bool HasXOP; + + /// Target has TBM instructions. + bool HasTBM; + + /// True if the processor has the MOVBE instruction. + bool HasMOVBE; + + /// True if the processor has the RDRAND instruction. + bool HasRDRAND; + + /// Processor has 16-bit floating point conversion instructions. + bool HasF16C; + + /// Processor has FS/GS base insturctions. + bool HasFSGSBase; + + /// Processor has LZCNT instruction. + bool HasLZCNT; + + /// Processor has BMI1 instructions. + bool HasBMI; + + /// Processor has BMI2 instructions. + bool HasBMI2; + + /// Processor has RTM instructions. + bool HasRTM; + + /// Processor has HLE. + bool HasHLE; + + /// Processor has ADX instructions. + bool HasADX; + + /// Processor has SHA instructions. + bool HasSHA; + + /// Processor has PRFCHW instructions. + bool HasPRFCHW; + + /// Processor has RDSEED instructions. + bool HasRDSEED; + + /// Processor has LAHF/SAHF instructions. + bool HasLAHFSAHF; + + /// True if BT (bit test) of memory instructions are slow. + bool IsBTMemSlow; + + /// True if SHLD instructions are slow. + bool IsSHLDSlow; + + /// True if unaligned memory accesses of 16-bytes are slow. + bool IsUAMem16Slow; + + /// True if unaligned memory accesses of 32-bytes are slow. + bool IsUAMem32Slow; + + /// True if SSE operations can have unaligned memory operands. + /// This may require setting a configuration bit in the processor. + bool HasSSEUnalignedMem; + + /// True if this processor has the CMPXCHG16B instruction; + /// this is true for most x86-64 chips, but not the first AMD chips. + bool HasCmpxchg16b; + + /// True if the LEA instruction should be used for adjusting + /// the stack pointer. This is an optimization for Intel Atom processors. + bool UseLeaForSP; + + /// True if 8-bit divisions are significantly faster than + /// 32-bit divisions and should be used when possible. + bool HasSlowDivide32; + + /// True if 16-bit divides are significantly faster than + /// 64-bit divisions and should be used when possible. + bool HasSlowDivide64; + + /// True if the short functions should be padded to prevent + /// a stall when returning too early. + bool PadShortFunctions; + + /// True if the Calls with memory reference should be converted + /// to a register-based indirect call. + bool CallRegIndirect; + + /// True if the LEA instruction inputs have to be ready at address generation + /// (AG) time. 
+ bool LEAUsesAG; + + /// True if the LEA instruction with certain arguments is slow + bool SlowLEA; + + /// True if INC and DEC instructions are slow when writing to flags + bool SlowIncDec; + + /// Processor has AVX-512 PreFetch Instructions + bool HasPFI; + + /// Processor has AVX-512 Exponential and Reciprocal Instructions + bool HasERI; + + /// Processor has AVX-512 Conflict Detection Instructions + bool HasCDI; + + /// Processor has AVX-512 Doubleword and Quadword instructions + bool HasDQI; + + /// Processor has AVX-512 Byte and Word instructions + bool HasBWI; + + /// Processor has AVX-512 Vector Length eXtenstions + bool HasVLX; + + /// Processor has PKU extenstions + bool HasPKU; + + /// Processot supports MPX - Memory Protection Extensions + bool HasMPX; + + /// Use software floating point for code generation. + bool UseSoftFloat; + + /// The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned stackAlignment; + + /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. + /// + unsigned MaxInlineSizeThreshold; + + /// What processor and OS we're targeting. + Triple TargetTriple; + + /// Instruction itineraries for scheduling + InstrItineraryData InstrItins; + +private: + + /// Override the stack alignment. + unsigned StackAlignOverride; + + /// True if compiling for 64-bit, false for 16-bit or 32-bit. + bool In64BitMode; + + /// True if compiling for 32-bit, false for 16-bit or 64-bit. + bool In32BitMode; + + /// True if compiling for 16-bit, false for 32-bit or 64-bit. + bool In16BitMode; + + X86SelectionDAGInfo TSInfo; + // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which + // X86TargetLowering needs. + X86InstrInfo InstrInfo; + X86TargetLowering TLInfo; + X86FrameLowering FrameLowering; + +public: + /// This constructor initializes the data members to match that + /// of the specified triple. + /// + X86Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, + const X86TargetMachine &TM, unsigned StackAlignOverride); + + const X86TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const X86FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const X86SelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + const X86RegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } + + /// Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return stackAlignment; } + + /// Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + +private: + /// Initialize the full set of dependencies so we can use an initializer + /// list for X86Subtarget. + X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); + void initializeEnvironment(); + void initSubtargetFeatures(StringRef CPU, StringRef FS); +public: + /// Is this x86_64? 
(disregarding specific ABI / programming model) + bool is64Bit() const { + return In64BitMode; + } + + bool is32Bit() const { + return In32BitMode; + } + + bool is16Bit() const { + return In16BitMode; + } + + /// Is this x86_64 with the ILP32 programming model (x32 ABI)? + bool isTarget64BitILP32() const { + return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 || + TargetTriple.isOSNaCl()); + } + + /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? + bool isTarget64BitLP64() const { + return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 && + !TargetTriple.isOSNaCl()); + } + + PICStyles::Style getPICStyle() const { return PICStyle; } + void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } + + bool hasCMov() const { return HasCMov; } + bool hasSSE1() const { return X86SSELevel >= SSE1; } + bool hasSSE2() const { return X86SSELevel >= SSE2; } + bool hasSSE3() const { return X86SSELevel >= SSE3; } + bool hasSSSE3() const { return X86SSELevel >= SSSE3; } + bool hasSSE41() const { return X86SSELevel >= SSE41; } + bool hasSSE42() const { return X86SSELevel >= SSE42; } + bool hasAVX() const { return X86SSELevel >= AVX; } + bool hasAVX2() const { return X86SSELevel >= AVX2; } + bool hasAVX512() const { return X86SSELevel >= AVX512F; } + bool hasFp256() const { return hasAVX(); } + bool hasInt256() const { return hasAVX2(); } + bool hasSSE4A() const { return HasSSE4A; } + bool hasMMX() const { return X863DNowLevel >= MMX; } + bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } + bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } + bool hasPOPCNT() const { return HasPOPCNT; } + bool hasAES() const { return HasAES; } + bool hasFXSR() const { return HasFXSR; } + bool hasXSAVE() const { return HasXSAVE; } + bool hasXSAVEOPT() const { return HasXSAVEOPT; } + bool hasXSAVEC() const { return HasXSAVEC; } + bool hasXSAVES() const { return HasXSAVES; } + bool hasPCLMUL() const { return HasPCLMUL; } + // Prefer FMA4 to FMA - its better for commutation/memory folding and + // has equal or better performance on all supported targets. 
+ bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA4() const { return HasFMA4; } + bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } + bool hasXOP() const { return HasXOP; } + bool hasTBM() const { return HasTBM; } + bool hasMOVBE() const { return HasMOVBE; } + bool hasRDRAND() const { return HasRDRAND; } + bool hasF16C() const { return HasF16C; } + bool hasFSGSBase() const { return HasFSGSBase; } + bool hasLZCNT() const { return HasLZCNT; } + bool hasBMI() const { return HasBMI; } + bool hasBMI2() const { return HasBMI2; } + bool hasRTM() const { return HasRTM; } + bool hasHLE() const { return HasHLE; } + bool hasADX() const { return HasADX; } + bool hasSHA() const { return HasSHA; } + bool hasPRFCHW() const { return HasPRFCHW; } + bool hasRDSEED() const { return HasRDSEED; } + bool hasLAHFSAHF() const { return HasLAHFSAHF; } + bool isBTMemSlow() const { return IsBTMemSlow; } + bool isSHLDSlow() const { return IsSHLDSlow; } + bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } + bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } + bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } + bool hasCmpxchg16b() const { return HasCmpxchg16b; } + bool useLeaForSP() const { return UseLeaForSP; } + bool hasSlowDivide32() const { return HasSlowDivide32; } + bool hasSlowDivide64() const { return HasSlowDivide64; } + bool padShortFunctions() const { return PadShortFunctions; } + bool callRegIndirect() const { return CallRegIndirect; } + bool LEAusesAG() const { return LEAUsesAG; } + bool slowLEA() const { return SlowLEA; } + bool slowIncDec() const { return SlowIncDec; } + bool hasCDI() const { return HasCDI; } + bool hasPFI() const { return HasPFI; } + bool hasERI() const { return HasERI; } + bool hasDQI() const { return HasDQI; } + bool hasBWI() const { return HasBWI; } + bool hasVLX() const { return HasVLX; } + bool hasPKU() const { return HasPKU; } + bool hasMPX() const { return HasMPX; } + + bool isAtom() const { return X86ProcFamily == IntelAtom; } + bool isSLM() const { return X86ProcFamily == IntelSLM; } + bool useSoftFloat() const { return UseSoftFloat; } + + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } + bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } + bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); } + bool isTargetPS4() const { return TargetTriple.isPS4(); } + + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } + bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } + bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } + bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } + bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } + + bool isTargetWindowsMSVC() const { + return TargetTriple.isWindowsMSVCEnvironment(); + } + + bool isTargetKnownWindowsMSVC() const { + return TargetTriple.isKnownWindowsMSVCEnvironment(); + } + + bool isTargetWindowsCoreCLR() const { + return TargetTriple.isWindowsCoreCLREnvironment(); + } + + bool isTargetWindowsCygwin() const { + return 
TargetTriple.isWindowsCygwinEnvironment(); + } + + bool isTargetWindowsGNU() const { + return TargetTriple.isWindowsGNUEnvironment(); + } + + bool isTargetWindowsItanium() const { + return TargetTriple.isWindowsItaniumEnvironment(); + } + + bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } + + bool isOSWindows() const { return TargetTriple.isOSWindows(); } + + bool isTargetWin64() const { + return In64BitMode && TargetTriple.isOSWindows(); + } + + bool isTargetWin32() const { + return !In64BitMode && (isTargetCygMing() || isTargetKnownWindowsMSVC()); + } + + bool isPICStyleSet() const { return PICStyle != PICStyles::None; } + bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; } + bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; } + + bool isPICStyleStubPIC() const { + return PICStyle == PICStyles::StubPIC; + } + + bool isPICStyleStubNoDynamic() const { + return PICStyle == PICStyles::StubDynamicNoPIC; + } + bool isPICStyleStubAny() const { + return PICStyle == PICStyles::StubDynamicNoPIC || + PICStyle == PICStyles::StubPIC; + } + + bool isCallingConvWin64(CallingConv::ID CC) const { + switch (CC) { + // On Win64, all these conventions just use the default convention. + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::X86_FastCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: + case CallingConv::Intel_OCL_BI: + return isTargetWin64(); + // This convention allows using the Win64 convention on other targets. + case CallingConv::X86_64_Win64: + return true; + // This convention allows using the SysV convention on Windows targets. + case CallingConv::X86_64_SysV: + return false; + // Otherwise, who knows what this is. + default: + return false; + } + } + + /// ClassifyGlobalReference - Classify a global variable reference for the + /// current subtarget according to how we should reference it in a non-pcrel + /// context. + unsigned char ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM)const; + + /// Classify a blockaddress reference for the current subtarget according to + /// how we should reference it in a non-pcrel context. + unsigned char ClassifyBlockAddressReference() const; + + /// Return true if the subtarget allows calls to immediate address. + bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const; + + /// This function returns the name of a function which has an interface + /// like the non-standard bzero function, if such a function exists on + /// the current subtarget and it is considered prefereable over + /// memset with zero passed as the second argument. Otherwise it + /// returns null. + const char *getBZeroEntry() const; + + /// This function returns true if the target has sincos() routine in its + /// compiler runtime or math libraries. + bool hasSinCos() const; + + /// Enable the MachineScheduler pass for all X86 subtargets. + bool enableMachineScheduler() const override { return true; } + + bool enableEarlyIfConversion() const override; + + /// Return the instruction itineraries based on the subtarget selection. 
+ const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + AntiDepBreakMode getAntiDepBreakMode() const override { + return TargetSubtargetInfo::ANTIDEP_CRITICAL; + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp new file mode 100644 index 0000000..0e7e4c0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -0,0 +1,280 @@ +//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#include "X86TargetMachine.h" +#include "X86.h" +#include "X86TargetObjectFile.h" +#include "X86TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + +namespace llvm { +void initializeWinEHStatePassPass(PassRegistry &); +} + +extern "C" void LLVMInitializeX86Target() { + // Register the target. + RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target); + RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeWinEHStatePassPass(PR); +} + +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::x86_64) + return make_unique<X86_64MachoTargetObjectFile>(); + return make_unique<TargetLoweringObjectFileMachO>(); + } + + if (TT.isOSLinux() || TT.isOSNaCl()) + return make_unique<X86LinuxNaClTargetObjectFile>(); + if (TT.isOSBinFormatELF()) + return make_unique<X86ELFTargetObjectFile>(); + if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment()) + return make_unique<X86WindowsTargetObjectFile>(); + if (TT.isOSBinFormatCOFF()) + return make_unique<TargetLoweringObjectFileCOFF>(); + llvm_unreachable("unknown subtarget type"); +} + +static std::string computeDataLayout(const Triple &TT) { + // X86 is little endian + std::string Ret = "e"; + + Ret += DataLayout::getManglingComponent(TT); + // X86 and x32 have 32 bit pointers. + if ((TT.isArch64Bit() && + (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) || + !TT.isArch64Bit()) + Ret += "-p:32:32"; + + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. + if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // Some ABIs align long double to 128 bits, others to 32. + if (TT.isOSNaCl()) + ; // No f80 + else if (TT.isArch64Bit() || TT.isOSDarwin()) + Ret += "-f80:128"; + else + Ret += "-f80:32"; + + // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. + if (TT.isArch64Bit()) + Ret += "-n8:16:32:64"; + else + Ret += "-n8:16:32"; + + // The stack is aligned to 32 bits on some ABIs and 128 bits on others. 
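+  // Putting the pieces together (editor's sketch, not in the original
+  // source): for x86_64-unknown-linux-gnu the string assembled by this
+  // function comes out as something like
+  //   "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  // i.e. little-endian, ELF mangling, 64-bit-aligned i64, 128-bit-aligned
+  // f80, native 8/16/32/64-bit integers, and a 128-bit-aligned stack.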
+ if (!TT.isArch64Bit() && TT.isOSWindows()) + Ret += "-a:0:32-S32"; + else + Ret += "-S128"; + + return Ret; +} + +/// X86TargetMachine ctor - Create an X86 target. +/// +X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OL), + TLOF(createTLOF(getTargetTriple())), + Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { + // Windows stack unwinder gets confused when execution flow "falls through" + // after a call to 'noreturn' function. + // To prevent that, we emit a trap for 'unreachable' IR instructions. + // (which on X86, happens to be the 'ud2' instruction) + if (Subtarget.isTargetWin64()) + this->Options.TrapUnreachable = true; + + // By default (and when -ffast-math is on), enable estimate codegen for + // everything except scalar division. By default, use 1 refinement step for + // all operations. Defaults may be overridden by using command-line options. + // Scalar division estimates are disabled because they break too much + // real-world code. These defaults match GCC behavior. + this->Options.Reciprocals.setDefaults("sqrtf", true, 1); + this->Options.Reciprocals.setDefaults("divf", false, 1); + this->Options.Reciprocals.setDefaults("vec-sqrtf", true, 1); + this->Options.Reciprocals.setDefaults("vec-divf", true, 1); + + initAsmInfo(); +} + +X86TargetMachine::~X86TargetMachine() {} + +const X86Subtarget * +X86TargetMachine::getSubtargetImpl(const Function &F) const { + Attribute CPUAttr = F.getFnAttribute("target-cpu"); + Attribute FSAttr = F.getFnAttribute("target-features"); + + std::string CPU = !CPUAttr.hasAttribute(Attribute::None) + ? CPUAttr.getValueAsString().str() + : TargetCPU; + std::string FS = !FSAttr.hasAttribute(Attribute::None) + ? FSAttr.getValueAsString().str() + : TargetFS; + + // FIXME: This is related to the code below to reset the target options, + // we need to know whether or not the soft float flag is set on the + // function before we can generate a subtarget. We also need to use + // it as a key for the subtarget since that can be the only difference + // between two functions. + bool SoftFloat = + F.hasFnAttribute("use-soft-float") && + F.getFnAttribute("use-soft-float").getValueAsString() == "true"; + // If the soft float attribute is set on the function turn on the soft float + // subtarget feature. + if (SoftFloat) + FS += FS.empty() ? "+soft-float" : ",+soft-float"; + + auto &I = SubtargetMap[CPU + FS]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. + resetTargetOptions(F); + I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this, + Options.StackAlignmentOverride); + } + return I.get(); +} + +//===----------------------------------------------------------------------===// +// Command line options for x86 +//===----------------------------------------------------------------------===// +static cl::opt<bool> +UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, + cl::desc("Minimize AVX to SSE transition penalty"), + cl::init(true)); + +//===----------------------------------------------------------------------===// +// X86 TTI query. 
+//===----------------------------------------------------------------------===// + +TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(X86TTIImpl(this, F)); + }); +} + + +//===----------------------------------------------------------------------===// +// Pass Pipeline Configuration +//===----------------------------------------------------------------------===// + +namespace { +/// X86 Code Generator Pass Configuration Options. +class X86PassConfig : public TargetPassConfig { +public: + X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + X86TargetMachine &getX86TargetMachine() const { + return getTM<X86TargetMachine>(); + } + + void addIRPasses() override; + bool addInstSelector() override; + bool addILPOpts() override; + bool addPreISel() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreEmitPass() override; + void addPreSched2() override; +}; +} // namespace + +TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { + return new X86PassConfig(this, PM); +} + +void X86PassConfig::addIRPasses() { + addPass(createAtomicExpandPass(&getX86TargetMachine())); + + TargetPassConfig::addIRPasses(); +} + +bool X86PassConfig::addInstSelector() { + // Install an instruction selector. + addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); + + // For ELF, cleanup any local-dynamic TLS accesses. + if (TM->getTargetTriple().isOSBinFormatELF() && + getOptLevel() != CodeGenOpt::None) + addPass(createCleanupLocalDynamicTLSPass()); + + addPass(createX86GlobalBaseRegPass()); + + return false; +} + +bool X86PassConfig::addILPOpts() { + addPass(&EarlyIfConverterID); + if (EnableMachineCombinerPass) + addPass(&MachineCombinerID); + return true; +} + +bool X86PassConfig::addPreISel() { + // Only add this pass for 32-bit x86 Windows. + const Triple &TT = TM->getTargetTriple(); + if (TT.isOSWindows() && TT.getArch() == Triple::x86) + addPass(createX86WinEHStatePass()); + return true; +} + +void X86PassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createX86OptimizeLEAs()); + + addPass(createX86CallFrameOptimization()); +} + +void X86PassConfig::addPostRegAlloc() { + addPass(createX86FloatingPointStackifierPass()); +} + +void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } + +void X86PassConfig::addPreEmitPass() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); + + if (UseVZeroUpper) + addPass(createX86IssueVZeroUpperPass()); + + if (getOptLevel() != CodeGenOpt::None) { + addPass(createX86PadShortFunctions()); + addPass(createX86FixupLEAs()); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h new file mode 100644 index 0000000..2629556 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h @@ -0,0 +1,49 @@ +//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the X86 specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H +#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class StringRef; + +class X86TargetMachine final : public LLVMTargetMachine { + std::unique_ptr<TargetLoweringObjectFile> TLOF; + X86Subtarget Subtarget; + + mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap; + +public: + X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + ~X86TargetMachine() override; + const X86Subtarget *getSubtargetImpl(const Function &F) const override; + + TargetIRAnalysis getTargetIRAnalysis() override; + + // Set up the pass pipeline. + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF.get(); + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp new file mode 100644 index 0000000..782768d --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -0,0 +1,175 @@ +//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "X86TargetObjectFile.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Operator.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Target/TargetLowering.h" + +using namespace llvm; +using namespace dwarf; + +const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference( + const GlobalValue *GV, unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, MachineModuleInfo *MMI, + MCStreamer &Streamer) const { + + // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which + // is an indirect pc-relative reference. + if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) { + const MCSymbol *Sym = TM.getSymbol(GV, Mang); + const MCExpr *Res = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); + const MCExpr *Four = MCConstantExpr::create(4, getContext()); + return MCBinaryExpr::createAdd(Res, Four, getContext()); + } + + return TargetLoweringObjectFileMachO::getTTypeGlobalReference( + GV, Encoding, Mang, TM, MMI, Streamer); +} + +MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol( + const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI) const { + return TM.getSymbol(GV, Mang); +} + +const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel( + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, MCStreamer &Streamer) const { + // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry + // from a data section. In case there's an additional offset, then use + // foo@GOTPCREL+4+<offset>. 
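+  // Worked example (editor's sketch, not in the original source): with
+  // Offset == 8 and no extra constant in the MCValue, FinalOff below becomes
+  // 8 + 0 + 4 = 12 and the emitted expression is foo@GOTPCREL+12.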
+ unsigned FinalOff = Offset+MV.getConstant()+4; + const MCExpr *Res = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); + const MCExpr *Off = MCConstantExpr::create(FinalOff, getContext()); + return MCBinaryExpr::createAdd(Res, Off, getContext()); +} + +const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol( + const MCSymbol *Sym) const { + return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); +} + +void +X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + +const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( + const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const { + // We are looking for the difference of two symbols, need a subtraction + // operation. + const SubOperator *Sub = dyn_cast<SubOperator>(CE); + if (!Sub) + return nullptr; + + // Symbols must first be numbers before we can subtract them, we need to see a + // ptrtoint on both subtraction operands. + const PtrToIntOperator *SubLHS = + dyn_cast<PtrToIntOperator>(Sub->getOperand(0)); + const PtrToIntOperator *SubRHS = + dyn_cast<PtrToIntOperator>(Sub->getOperand(1)); + if (!SubLHS || !SubRHS) + return nullptr; + + // Our symbols should exist in address space zero, cowardly no-op if + // otherwise. + if (SubLHS->getPointerAddressSpace() != 0 || + SubRHS->getPointerAddressSpace() != 0) + return nullptr; + + // Both ptrtoint instructions must wrap global objects: + // - Only global variables are eligible for image relative relocations. + // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable. + const auto *GOLHS = dyn_cast<GlobalObject>(SubLHS->getPointerOperand()); + const auto *GVRHS = dyn_cast<GlobalVariable>(SubRHS->getPointerOperand()); + if (!GOLHS || !GVRHS) + return nullptr; + + // We expect __ImageBase to be a global variable without a section, externally + // defined. + // + // It should look something like this: @__ImageBase = external constant i8 + if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" || + !GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() || + GVRHS->hasSection()) + return nullptr; + + // An image-relative, thread-local, symbol makes no sense. 
+ if (GOLHS->isThreadLocal()) + return nullptr; + + return MCSymbolRefExpr::create(TM.getSymbol(GOLHS, Mang), + MCSymbolRefExpr::VK_COFF_IMGREL32, + getContext()); +} + +static std::string APIntToHexString(const APInt &AI) { + unsigned Width = (AI.getBitWidth() / 8) * 2; + std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true); + unsigned Size = HexString.size(); + assert(Width >= Size && "hex string is too large!"); + HexString.insert(HexString.begin(), Width - Size, '0'); + + return HexString; +} + +static std::string scalarConstantToHexString(const Constant *C) { + Type *Ty = C->getType(); + if (isa<UndefValue>(C)) { + return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits())); + } else if (const auto *CFP = dyn_cast<ConstantFP>(C)) { + return APIntToHexString(CFP->getValueAPF().bitcastToAPInt()); + } else if (const auto *CI = dyn_cast<ConstantInt>(C)) { + return APIntToHexString(CI->getValue()); + } else { + unsigned NumElements; + if (isa<VectorType>(Ty)) + NumElements = Ty->getVectorNumElements(); + else + NumElements = Ty->getArrayNumElements(); + std::string HexString; + for (int I = NumElements - 1, E = -1; I != E; --I) + HexString += scalarConstantToHexString(C->getAggregateElement(I)); + return HexString; + } +} + +MCSection *X86WindowsTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { + if (Kind.isMergeableConst() && C) { + const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_LNK_COMDAT; + std::string COMDATSymName; + if (Kind.isMergeableConst4() || Kind.isMergeableConst8()) + COMDATSymName = "__real@" + scalarConstantToHexString(C); + else if (Kind.isMergeableConst16()) + COMDATSymName = "__xmm@" + scalarConstantToHexString(C); + + if (!COMDATSymName.empty()) + return getContext().getCOFFSection(".rdata", Characteristics, Kind, + COMDATSymName, + COFF::IMAGE_COMDAT_SELECT_ANY); + } + + return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C); +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h new file mode 100644 index 0000000..6b2448c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -0,0 +1,67 @@ +//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { + + /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin + /// x86-64. + class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO { + public: + const MCExpr * + getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, + Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; + + // getCFIPersonalitySymbol - The symbol that gets passed to + // .cfi_personality. 
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI) const override; + + const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; + }; + + /// \brief This implemenatation is used for X86 ELF targets that don't + /// have a further specialization. + class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF { + /// \brief Describe a TLS variable address within debug info. + const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; + }; + + /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and + /// Native Client on x86 and x86-64. + class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + }; + + /// \brief This implementation is used for Windows targets on x86 and x86-64. + class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF { + const MCExpr * + getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang, + const TargetMachine &TM) const override; + + /// \brief Given a mergeable constant with the specified size and relocation + /// information, return a section that it should be placed in. + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, + const Constant *C) const override; + }; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp new file mode 100644 index 0000000..2e7bbb2 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -0,0 +1,1487 @@ +//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// X86 target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "X86TargetTransformInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86tti" + +//===----------------------------------------------------------------------===// +// +// X86 cost model. +// +//===----------------------------------------------------------------------===// + +TargetTransformInfo::PopcntSupportKind +X86TTIImpl::getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + // TODO: Currently the __builtin_popcount() implementation using SSE3 + // instructions is inefficient. Once the problem is fixed, we should + // call ST->hasSSE3() instead of ST->hasPOPCNT(). + return ST->hasPOPCNT() ? 
TTI::PSK_FastHardware : TTI::PSK_Software; +} + +unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { + if (Vector && !ST->hasSSE1()) + return 0; + + if (ST->is64Bit()) { + if (Vector && ST->hasAVX512()) + return 32; + return 16; + } + return 8; +} + +unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { + if (Vector) { + if (ST->hasAVX512()) return 512; + if (ST->hasAVX()) return 256; + if (ST->hasSSE1()) return 128; + return 0; + } + + if (ST->is64Bit()) + return 64; + + return 32; +} + +unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { + // If the loop will not be vectorized, don't interleave the loop. + // Let regular unroll to unroll the loop, which saves the overflow + // check and memory check cost. + if (VF == 1) + return 1; + + if (ST->isAtom()) + return 1; + + // Sandybridge and Haswell have multiple execution ports and pipelined + // vector units. + if (ST->hasAVX()) + return 4; + + return 2; +} + +int X86TTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, + TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + if (ISD == ISD::SDIV && + Op2Info == TargetTransformInfo::OK_UniformConstantValue && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + // On X86, vector signed division by constants power-of-two are + // normally expanded to the sequence SRA + SRL + ADD + SRA. + // The OperandValue properties many not be same as that of previous + // operation;conservatively assume OP_None. + int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + + return Cost; + } + + static const CostTblEntry AVX2UniformConstCostTable[] = { + { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. + + { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence + { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence + { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence + { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasAVX2()) { + if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX512CostTable[] = { + { ISD::SHL, MVT::v16i32, 1 }, + { ISD::SRL, MVT::v16i32, 1 }, + { ISD::SRA, MVT::v16i32, 1 }, + { ISD::SHL, MVT::v8i64, 1 }, + { ISD::SRL, MVT::v8i64, 1 }, + { ISD::SRA, MVT::v8i64, 1 }, + }; + + if (ST->hasAVX512()) { + if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CostTable[] = { + // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to + // customize them to detect the cases where shift amount is a scalar one. 
+ { ISD::SHL, MVT::v4i32, 1 }, + { ISD::SRL, MVT::v4i32, 1 }, + { ISD::SRA, MVT::v4i32, 1 }, + { ISD::SHL, MVT::v8i32, 1 }, + { ISD::SRL, MVT::v8i32, 1 }, + { ISD::SRA, MVT::v8i32, 1 }, + { ISD::SHL, MVT::v2i64, 1 }, + { ISD::SRL, MVT::v2i64, 1 }, + { ISD::SHL, MVT::v4i64, 1 }, + { ISD::SRL, MVT::v4i64, 1 }, + }; + + // Look for AVX2 lowering tricks. + if (ST->hasAVX2()) { + if (ISD == ISD::SHL && LT.second == MVT::v16i16 && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + // On AVX2, a packed v16i16 shift left by a constant build_vector + // is lowered into a vector multiply (vpmullw). + return LT.first; + + if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry XOPCostTable[] = { + // 128bit shifts take 1cy, but right shifts require negation beforehand. + { ISD::SHL, MVT::v16i8, 1 }, + { ISD::SRL, MVT::v16i8, 2 }, + { ISD::SRA, MVT::v16i8, 2 }, + { ISD::SHL, MVT::v8i16, 1 }, + { ISD::SRL, MVT::v8i16, 2 }, + { ISD::SRA, MVT::v8i16, 2 }, + { ISD::SHL, MVT::v4i32, 1 }, + { ISD::SRL, MVT::v4i32, 2 }, + { ISD::SRA, MVT::v4i32, 2 }, + { ISD::SHL, MVT::v2i64, 1 }, + { ISD::SRL, MVT::v2i64, 2 }, + { ISD::SRA, MVT::v2i64, 2 }, + // 256bit shifts require splitting if AVX2 didn't catch them above. + { ISD::SHL, MVT::v32i8, 2 }, + { ISD::SRL, MVT::v32i8, 4 }, + { ISD::SRA, MVT::v32i8, 4 }, + { ISD::SHL, MVT::v16i16, 2 }, + { ISD::SRL, MVT::v16i16, 4 }, + { ISD::SRA, MVT::v16i16, 4 }, + { ISD::SHL, MVT::v8i32, 2 }, + { ISD::SRL, MVT::v8i32, 4 }, + { ISD::SRA, MVT::v8i32, 4 }, + { ISD::SHL, MVT::v4i64, 2 }, + { ISD::SRL, MVT::v4i64, 4 }, + { ISD::SRA, MVT::v4i64, 4 }, + }; + + // Look for XOP lowering tricks. + if (ST->hasXOP()) { + if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CustomCostTable[] = { + { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. + { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v32i8, 32*20 }, + { ISD::SDIV, MVT::v16i16, 16*20 }, + { ISD::SDIV, MVT::v8i32, 8*20 }, + { ISD::SDIV, MVT::v4i64, 4*20 }, + { ISD::UDIV, MVT::v32i8, 32*20 }, + { ISD::UDIV, MVT::v16i16, 16*20 }, + { ISD::UDIV, MVT::v8i32, 8*20 }, + { ISD::UDIV, MVT::v4i64, 4*20 }, + }; + + // Look for AVX2 lowering tricks for custom cases. + if (ST->hasAVX2()) { + if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry + SSE2UniformConstCostTable[] = { + // We don't correctly identify costs of casts because they are marked as + // custom. + // Constant splats are cheaper for the following instructions. + { ISD::SHL, MVT::v16i8, 1 }, // psllw. + { ISD::SHL, MVT::v32i8, 2 }, // psllw. + { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v16i16, 2 }, // psllw. + { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v8i32, 2 }, // pslld + { ISD::SHL, MVT::v2i64, 1 }, // psllq. 
+ { ISD::SHL, MVT::v4i64, 2 }, // psllq. + + { ISD::SRL, MVT::v16i8, 1 }, // psrlw. + { ISD::SRL, MVT::v32i8, 2 }, // psrlw. + { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v16i16, 2 }, // psrlw. + { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v8i32, 2 }, // psrld. + { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + { ISD::SRL, MVT::v4i64, 2 }, // psrlq. + + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v16i16, 2 }, // psraw. + { ISD::SRA, MVT::v4i32, 1 }, // psrad. + { ISD::SRA, MVT::v8i32, 2 }, // psrad. + { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. + + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasSSE2()) { + // pmuldq sequence. + if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) + return LT.first * 15; + + if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + + if (ISD == ISD::SHL && + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { + MVT VT = LT.second; + // Vector shift left by non uniform constant can be lowered + // into vector multiply (pmullw/pmulld). + if ((VT == MVT::v8i16 && ST->hasSSE2()) || + (VT == MVT::v4i32 && ST->hasSSE41())) + return LT.first; + + // v16i16 and v8i32 shifts by non-uniform constants are lowered into a + // sequence of extract + two vector multiply + insert. + if ((VT == MVT::v8i32 || VT == MVT::v16i16) && + (ST->hasAVX() && !ST->hasAVX2())) + ISD = ISD::MUL; + + // A vector shift left by non uniform constant is converted + // into a vector multiply; the new multiply is eventually + // lowered into a sequence of shuffles and 2 x pmuludq. + if (VT == MVT::v4i32 && ST->hasSSE2()) + ISD = ISD::MUL; + } + + static const CostTblEntry SSE2CostTable[] = { + // We don't correctly identify costs of casts because they are marked as + // custom. + // For some cases, where the shift amount is a scalar we would be able + // to generate better code. Unfortunately, when this is the case the value + // (the splat) will get hoisted out of the loop, thereby making it invisible + // to ISel. The cost model must return worst case assumptions because it is + // used for vectorization and we don't want to make vectorized code worse + // than scalar code. + { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. + + { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. 
+ { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. + + { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. + + // It is not a good idea to vectorize division. We have to scalarize it and + // in the process we will often end up having to spilling regular + // registers. The overhead of division is going to dominate most kernels + // anyways so try hard to prevent vectorization of division - it is + // generally a bad idea. Assume somewhat arbitrarily that we have to be able + // to hide "20 cycles" for each lane. + { ISD::SDIV, MVT::v16i8, 16*20 }, + { ISD::SDIV, MVT::v8i16, 8*20 }, + { ISD::SDIV, MVT::v4i32, 4*20 }, + { ISD::SDIV, MVT::v2i64, 2*20 }, + { ISD::UDIV, MVT::v16i8, 16*20 }, + { ISD::UDIV, MVT::v8i16, 8*20 }, + { ISD::UDIV, MVT::v4i32, 4*20 }, + { ISD::UDIV, MVT::v2i64, 2*20 }, + }; + + if (ST->hasSSE2()) { + if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX1CostTable[] = { + // We don't have to scalarize unsupported ops. We can issue two half-sized + // operations and we only need to extract the upper YMM half. + // Two ops + 1 extract + 1 insert = 4. + { ISD::MUL, MVT::v16i16, 4 }, + { ISD::MUL, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v8i32, 4 }, + { ISD::ADD, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v4i64, 4 }, + { ISD::ADD, MVT::v4i64, 4 }, + // A v4i64 multiply is custom lowered as two split v2i64 vectors that then + // are lowered as a series of long multiplies(3), shifts(4) and adds(2) + // Because we believe v4i64 to be a legal type, we must also include the + // split factor of two in the cost table. Therefore, the cost here is 18 + // instead of 9. + { ISD::MUL, MVT::v4i64, 18 }, + }; + + // Look for AVX1 lowering tricks. + if (ST->hasAVX() && !ST->hasAVX2()) { + MVT VT = LT.second; + + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + return LT.first * Entry->Cost; + } + + // Custom lowering of vectors. + static const CostTblEntry CustomLowered[] = { + // A v2i64/v4i64 and multiply is custom lowered as a series of long + // multiplies(3), shifts(4) and adds(2). + { ISD::MUL, MVT::v2i64, 9 }, + { ISD::MUL, MVT::v4i64, 9 }, + }; + if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) + return LT.first * Entry->Cost; + + // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, + // 2x pmuludq, 2x shuffle. + if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() && + !ST->hasSSE41()) + return LT.first * 6; + + // Fallback to the default implementation. + return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); +} + +int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + // We only estimate the cost of reverse and alternate shuffles. + if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + + if (Kind == TTI::SK_Reverse) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + int Cost = 1; + if (LT.second.getSizeInBits() > 128) + Cost = 3; // Extract + insert + copy. 
+ + // Multiple by the number of parts. + return Cost * LT.first; + } + + if (Kind == TTI::SK_Alternate) { + // 64-bit packed float vectors (v2f32) are widened to type v4f32. + // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + // The backend knows how to generate a single VEX.256 version of + // instruction VPBLENDW if the target supports AVX2. + if (ST->hasAVX2() && LT.second == MVT::v16i16) + return LT.first; + + static const CostTblEntry AVXAltShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd + {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd + + {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps + {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps + + // This shuffle is custom lowered into a sequence of: + // 2x vextractf128 , 2x vpblendw , 1x vinsertf128 + {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5}, + + // This shuffle is custom lowered into a long sequence of: + // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128 + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9} + }; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE41AltShuffleTbl[] = { + // These are lowered into movsd. + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + + // packed float vectors with four elements are lowered into BLENDI dag + // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'. + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + + // This shuffle generates a single pshufw. + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + + // There is no instruction that matches a v16i8 alternate shuffle. + // The backend will expand it into the sequence 'pshufb + pshufb + or'. + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} + }; + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSSE3AltShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd + + // SSE3 doesn't have 'blendps'. The following shuffles are expanded into + // the sequence 'shufps + pshufd' + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or + }; + + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSEAltShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd + + // This is expanded into a long sequence of four extract + four insert. + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw. + + // 8 x (pinsrw + pextrw + and + movb + movzb + or) + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48} + }; + + // Fall-back (SSE3 and SSE2). 
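All of the arithmetic and shuffle tables in this file, including the SSE fall-back lookup right after this comment, follow one pattern: legalize the IR type, key the table on (ISD opcode, legalized MVT), and scale the per-entry cost by the legalization split factor LT.first. The following is a small standalone sketch of that pattern, with plain C++ types standing in for CostTblEntry/CostTableLookup; it is an illustration, not part of the patch.

#include <cstddef>

struct CostEntrySketch { int ISD; int SimpleVT; int Cost; };

// Linear scan keyed on (opcode, legalized value type) -- the same contract as
// CostTableLookup. A null result means "fall through to the next table, or to
// the BaseT default implementation".
static const CostEntrySketch *
lookupCostSketch(const CostEntrySketch *Tbl, size_t N, int ISD, int VT) {
  for (size_t I = 0; I != N; ++I)
    if (Tbl[I].ISD == ISD && Tbl[I].SimpleVT == VT)
      return &Tbl[I];
  return nullptr;
}

// Usage: a v8i32 shift on an SSE2-only target legalizes to two v4i32 halves,
// so LT.first == 2 and the reported cost is 2 * (cost of the v4i32 row).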
+ if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + } + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +} + +int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // FIXME: Need a better design of the cost table to handle non-simple types of + // potential massive combinations (elem_num x src_type x dst_type). + + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, + }; + + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { + { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, + { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, + { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, + + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, + + // v16i1 -> v16i32 - load + broadcast + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, + + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, 
MVT::v2f64, MVT::v2i64, 5 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, + + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, + }; + + static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 }, + + { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, + { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, + + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, + }; + + static const TypeConversionCostTblEntry AVXConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, + + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 }, + + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, + { ISD::UINT_TO_FP, 
MVT::v8f32, MVT::v8i8, 5 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, + // The generic code to compute the scalar overhead is currently broken. + // Workaround this limitation by estimating the scalarization overhead + // here. We have roughly 10 instructions per scalar element. + // Multiply that by the vector width. + // FIXME: remove that when PR19268 is fixed. + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 }, + + { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, + // This node is expanded into scalarized operations but BasicTTI is overly + // optimistic estimating its cost. It computes 3 per element (one + // vector-extract, one scalar conversion and one vector-insert). The + // problem is that the inserts form a read-modify-write chain so latency + // should be factored in too. Inflating the cost per element by 1. + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, + }; + + static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, + }; + + static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { + // These are somewhat magic numbers justified by looking at the output of + // Intel's IACA, running some kernels and making sure when we take + // legalization into account the throughput will be overestimated. 
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + // There are faster sequences for float conversions. + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, + }; + + std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); + + if (ST->hasSSE2() && !ST->hasAVX()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + LTDest.second, LTSrc.second)) + return LTSrc.first * Entry->Cost; + } + + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); + + // The function getSimpleVT only handles simple value types. 
+ if (!SrcTy.isSimple() || !DstTy.isSimple()) + return BaseT::getCastInstrCost(Opcode, Dst, Src); + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX2()) { + if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasAVX()) { + if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE41()) { + if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE2()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + return BaseT::getCastInstrCost(Opcode, Dst, Src); +} + +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + static const CostTblEntry SSE42CostTbl[] = { + { ISD::SETCC, MVT::v2f64, 1 }, + { ISD::SETCC, MVT::v4f32, 1 }, + { ISD::SETCC, MVT::v2i64, 1 }, + { ISD::SETCC, MVT::v4i32, 1 }, + { ISD::SETCC, MVT::v8i16, 1 }, + { ISD::SETCC, MVT::v16i8, 1 }, + }; + + static const CostTblEntry AVX1CostTbl[] = { + { ISD::SETCC, MVT::v4f64, 1 }, + { ISD::SETCC, MVT::v8f32, 1 }, + // AVX1 does not support 8-wide integer compare. + { ISD::SETCC, MVT::v4i64, 4 }, + { ISD::SETCC, MVT::v8i32, 4 }, + { ISD::SETCC, MVT::v16i16, 4 }, + { ISD::SETCC, MVT::v32i8, 4 }, + }; + + static const CostTblEntry AVX2CostTbl[] = { + { ISD::SETCC, MVT::v4i64, 1 }, + { ISD::SETCC, MVT::v8i32, 1 }, + { ISD::SETCC, MVT::v16i16, 1 }, + { ISD::SETCC, MVT::v32i8, 1 }, + }; + + static const CostTblEntry AVX512CostTbl[] = { + { ISD::SETCC, MVT::v8i64, 1 }, + { ISD::SETCC, MVT::v16i32, 1 }, + { ISD::SETCC, MVT::v8f64, 1 }, + { ISD::SETCC, MVT::v16f32, 1 }, + }; + + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { + assert(Val->isVectorTy() && "This must be a vector type"); + + if (Index != -1U) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + + // This type is legalized to a scalar type. + if (!LT.second.isVector()) + return 0; + + // The type may be split. Normalize the index to the new type. + unsigned Width = LT.second.getVectorNumElements(); + Index = Index % Width; + + // Floating point scalars are already located in index #0. 
+ if (Val->getScalarType()->isFloatingPointTy() && Index == 0) + return 0; + } + + return BaseT::getVectorInstrCost(Opcode, Val, Index); +} + +int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { + assert (Ty->isVectorTy() && "Can only scalarize vectors"); + int Cost = 0; + + for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { + if (Insert) + Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (Extract) + Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i); + } + + return Cost; +} + +int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { + // Handle non-power-of-two vectors such as <3 x float> + if (VectorType *VTy = dyn_cast<VectorType>(Src)) { + unsigned NumElem = VTy->getVectorNumElements(); + + // Handle a few common cases: + // <3 x float> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) + // Cost = 64 bit store + extract + 32 bit store. + return 3; + + // <3 x double> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) + // Cost = 128 bit store + unpack + 64 bit store. + return 3; + + // Assume that all other non-power-of-two numbers are scalarized. + if (!isPowerOf2_32(NumElem)) { + int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, + AddressSpace); + int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, + Opcode == Instruction::Store); + return NumElem * Cost + SplitCost; + } + } + + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Invalid Opcode"); + + // Each load/store unit costs 1. + int Cost = LT.first * 1; + + // On Sandybridge 256bit load/stores are double pumped + // (but not on Haswell). + if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2()) + Cost*=2; + + return Cost; +} + +int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, + unsigned AddressSpace) { + VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); + if (!SrcVTy) + // To calculate scalar take the regular cost, without mask + return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); + + unsigned NumElem = SrcVTy->getVectorNumElements(); + VectorType *MaskTy = + VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem); + if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || + !isPowerOf2_32(NumElem)) { + // Scalarization + int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = getCmpSelInstrCost( + Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); + + int ValueSplitCost = getScalarizationOverhead( + SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); + int MemopCost = + NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); + return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; + } + + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); + auto VT = TLI->getValueType(DL, SrcVTy); + int Cost = 0; + if (VT.isSimple() && LT.second != VT.getSimpleVT() && + LT.second.getVectorNumElements() == NumElem) + // Promotion requires expand/truncate for data and a shuffle for mask. 
+ Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) + + getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr); + + else if (LT.second.getVectorNumElements() > NumElem) { + VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), + LT.second.getVectorNumElements()); + // Expanding requires fill mask with zeroes + Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); + } + if (!ST->hasAVX512()) + return Cost + LT.first*4; // Each maskmov costs 4 + + // AVX-512 masked load/store is cheapper + return Cost+LT.first; +} + +int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. The resulting + // extra micro-ops can significantly decrease throughput. + unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + + return BaseT::getAddressComputationCost(Ty, IsComplex); +} + +int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, + bool IsPairwise) { + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput + // and make it as the cost. + + static const CostTblEntry SSE42CostTblPairWise[] = { + { ISD::FADD, MVT::v2f64, 2 }, + { ISD::FADD, MVT::v4f32, 4 }, + { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". + { ISD::ADD, MVT::v8i16, 5 }, + }; + + static const CostTblEntry AVX1CostTblPairWise[] = { + { ISD::FADD, MVT::v4f32, 4 }, + { ISD::FADD, MVT::v4f64, 5 }, + { ISD::FADD, MVT::v8f32, 7 }, + { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". + { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". + { ISD::ADD, MVT::v8i16, 5 }, + { ISD::ADD, MVT::v8i32, 5 }, + }; + + static const CostTblEntry SSE42CostTblNoPairWise[] = { + { ISD::FADD, MVT::v2f64, 2 }, + { ISD::FADD, MVT::v4f32, 4 }, + { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". + { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". + }; + + static const CostTblEntry AVX1CostTblNoPairWise[] = { + { ISD::FADD, MVT::v4f32, 3 }, + { ISD::FADD, MVT::v4f64, 3 }, + { ISD::FADD, MVT::v8f32, 4 }, + { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8". 
+ { ISD::ADD, MVT::v4i64, 3 }, + { ISD::ADD, MVT::v8i16, 4 }, + { ISD::ADD, MVT::v8i32, 5 }, + }; + + if (IsPairwise) { + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } else { + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } + + return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); +} + +/// \brief Calculate the cost of materializing a 64-bit value. This helper +/// method might only calculate a fraction of a larger immediate. Therefore it +/// is valid to return a cost of ZERO. +int X86TTIImpl::getIntImmCost(int64_t Val) { + if (Val == 0) + return TTI::TCC_Free; + + if (isInt<32>(Val)) + return TTI::TCC_Basic; + + return 2 * TTI::TCC_Basic; +} + +int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + // Never hoist constants larger than 128bit, because this might lead to + // incorrect code generation or assertions in codegen. + // Fixme: Create a cost model for types larger than i128 once the codegen + // issues have been fixed. + if (BitSize > 128) + return TTI::TCC_Free; + + if (Imm == 0) + return TTI::TCC_Free; + + // Sign-extend all constants to a multiple of 64-bit. + APInt ImmVal = Imm; + if (BitSize & 0x3f) + ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); + + // Split the constant into 64-bit chunks and calculate the cost for each + // chunk. + int Cost = 0; + for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { + APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); + int64_t Val = Tmp.getSExtValue(); + Cost += getIntImmCost(Val); + } + // We need at least one instruction to materialze the constant. + return std::max(1, Cost); +} + +int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. + if (BitSize == 0) + return TTI::TCC_Free; + + unsigned ImmIdx = ~0U; + switch (Opcode) { + default: + return TTI::TCC_Free; + case Instruction::GetElementPtr: + // Always hoist the base address of a GetElementPtr. This prevents the + // creation of new constants for every base constant that gets constant + // folded with the offset. + if (Idx == 0) + return 2 * TTI::TCC_Basic; + return TTI::TCC_Free; + case Instruction::Store: + ImmIdx = 0; + break; + case Instruction::ICmp: + // This is an imperfect hack to prevent constant hoisting of + // compares that might be trying to check if a 64-bit value fits in + // 32-bits. The backend can optimize these cases using a right shift by 32. + // Ideally we would check the compare predicate here. There also other + // similar immediates the backend can use shifts for. 
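The operand-position rules in this switch decide when an immediate is effectively free and therefore should not be touched by constant hoisting. A condensed sketch of three of those rules (shift amounts, 32-bit AND masks, and the 64-bit compare special case handled just below) follows; the enum and helper names are hypothetical and used only for illustration.

#include <cstdint>

enum OpKindSketch { ShiftOp, AndOp, ICmpOp };

static bool immIsFreeSketch(OpKindSketch Op, unsigned Idx, uint64_t Imm,
                            unsigned BitWidth) {
  switch (Op) {
  case ShiftOp:
    return Idx == 1;                         // shift amounts always encode inline
  case AndOp:
    return Idx == 1 && BitWidth == 64 &&     // AND with a zero-extended 32-bit
           Imm <= 0xffffffffULL;             // mask lowers to a 32-bit operation
  case ICmpOp:
    return Idx == 1 && BitWidth == 64 &&     // "fits in 32 bits?" compares can be
           (Imm == 0x100000000ULL ||         // answered with a shift instead of
            Imm == 0xffffffffULL);           // materializing the constant
  }
  return false;
}

// immIsFreeSketch(ICmpOp, 1, 0x100000000ULL, 64) == true, matching the
// TCC_Free result of the Instruction::ICmp case below.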
+ if (Idx == 1 && Imm.getBitWidth() == 64) { + uint64_t ImmVal = Imm.getZExtValue(); + if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) + return TTI::TCC_Free; + } + ImmIdx = 1; + break; + case Instruction::And: + // We support 64-bit ANDs with immediates with 32-bits of leading zeroes + // by using a 32-bit operation with implicit zero extension. Detect such + // immediates here as the normal path expects bit 31 to be sign extended. + if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) + return TTI::TCC_Free; + // Fallthrough + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::Or: + case Instruction::Xor: + ImmIdx = 1; + break; + // Always return TCC_Free for the shift value of a shift instruction. + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + if (Idx == 1) + return TTI::TCC_Free; + break; + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::IntToPtr: + case Instruction::PtrToInt: + case Instruction::BitCast: + case Instruction::PHI: + case Instruction::Call: + case Instruction::Select: + case Instruction::Ret: + case Instruction::Load: + break; + } + + if (Idx == ImmIdx) { + int NumConstants = (BitSize + 63) / 64; + int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TTI::TCC_Basic) + ? static_cast<int>(TTI::TCC_Free) + : Cost; + } + + return X86TTIImpl::getIntImmCost(Imm, Ty); +} + +int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. + if (BitSize == 0) + return TTI::TCC_Free; + + switch (IID) { + default: + return TTI::TCC_Free; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) + return TTI::TCC_Free; + break; + case Intrinsic::experimental_stackmap: + if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; + case Intrinsic::experimental_patchpoint_void: + case Intrinsic::experimental_patchpoint_i64: + if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; + } + return X86TTIImpl::getIntImmCost(Imm, Ty); +} + +// Return an average cost of Gather / Scatter instruction, maybe improved later +int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace) { + + assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); + unsigned VF = SrcVTy->getVectorNumElements(); + + // Try to reduce index size from 64 bit (default for GEP) + // to 32. It is essential for VF 16. If the index can't be reduced to 32, the + // operation will use 16 x 64 indices which do not fit in a zmm and needs + // to split. Also check that the base pointer is the same for all lanes, + // and that there's at most one variable index. 
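The lambda that follows implements the index-width decision described in the comment above: with VF >= 16, 64-bit indices would not fit in a single zmm register, so the GEP is inspected to prove that 32-bit indices suffice. A reduced sketch of the decision is shown below; the helper name and the boolean summarizing the GEP analysis are hypothetical.

static unsigned chooseIndexWidthSketch(unsigned VF, bool IndicesFitIn32Bits) {
  // 16 x i64 indices need two 512-bit registers and force a split gather;
  // 16 x i32 indices fit in one.
  if (VF >= 16 && IndicesFitIn32Bits)
    return 32;
  return 64;   // default: index width equals the pointer width on x86-64
}

// chooseIndexWidthSketch(16, true) == 32; any other combination keeps the
// 64-bit default.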
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { + unsigned IndexSize = DL.getPointerSizeInBits(); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (IndexSize < 64 || !GEP) + return IndexSize; + + unsigned NumOfVarIndices = 0; + Value *Ptrs = GEP->getPointerOperand(); + if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) + return IndexSize; + for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { + if (isa<Constant>(GEP->getOperand(i))) + continue; + Type *IndxTy = GEP->getOperand(i)->getType(); + if (IndxTy->isVectorTy()) + IndxTy = IndxTy->getVectorElementType(); + if ((IndxTy->getPrimitiveSizeInBits() == 64 && + !isa<SExtInst>(GEP->getOperand(i))) || + ++NumOfVarIndices > 1) + return IndexSize; // 64 + } + return (unsigned)32; + }; + + + // Trying to reduce IndexSize to 32 bits for vector 16. + // By default the IndexSize is equal to pointer size. + unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : + DL.getPointerSizeInBits(); + + Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(), + IndexSize), VF); + std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); + std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); + int SplitFactor = std::max(IdxsLT.first, SrcLT.first); + if (SplitFactor > 1) { + // Handle splitting of vector of pointers + Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); + return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, + AddressSpace); + } + + // The gather / scatter cost is given by Intel architects. It is a rough + // number since we are looking at one instruction in a time. + const int GSOverhead = 2; + return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); +} + +/// Return the cost of full scalarization of gather / scatter operation. +/// +/// Opcode - Load or Store instruction. +/// SrcVTy - The type of the data vector that should be gathered or scattered. +/// VariableMask - The mask is non-constant at compile time. +/// Alignment - Alignment for one element. +/// AddressSpace - pointer[s] address space. +/// +int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, + bool VariableMask, unsigned Alignment, + unsigned AddressSpace) { + unsigned VF = SrcVTy->getVectorNumElements(); + + int MaskUnpackCost = 0; + if (VariableMask) { + VectorType *MaskTy = + VectorType::get(Type::getInt1Ty(getGlobalContext()), VF); + MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = + getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()), + nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); + } + + // The cost of the scalar loads/stores. 
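To make the formula assembled by getGSScalarCost concrete, here is the same computation with hypothetical unit costs plugged in for VF = 4; every named constant below is an assumption for illustration, not a value taken from the cost tables.

static int scalarGatherCostSketch() {
  const int VF = 4;
  const int ScalarLoadCost    = 1;  // getMemoryOpCost on the element type (assumed)
  const int MaskExtractCost   = 4;  // getScalarizationOverhead(MaskTy, false, true) (assumed)
  const int ScalarCompareCost = 1;  // icmp on each extracted mask bit (assumed)
  const int BranchCost        = 1;  // getCFInstrCost(Instruction::Br) (assumed)
  const int InsertCost        = 1;  // insert each loaded scalar into the result (assumed)

  int MaskUnpackCost    = MaskExtractCost + VF * (BranchCost + ScalarCompareCost); // 12
  int MemoryOpCost      = VF * ScalarLoadCost;                                     //  4
  int InsertExtractCost = VF * InsertCost;                                         //  4
  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;                        // 20
}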
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); + + int InsertExtractCost = 0; + if (Opcode == Instruction::Load) + for (unsigned i = 0; i < VF; ++i) + // Add the cost of inserting each scalar load into the vector + InsertExtractCost += + getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); + else + for (unsigned i = 0; i < VF; ++i) + // Add the cost of extracting each element out of the data vector + InsertExtractCost += + getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); + + return MemoryOpCost + MaskUnpackCost + InsertExtractCost; +} + +/// Calculate the cost of Gather / Scatter operation +int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) { + assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); + unsigned VF = SrcVTy->getVectorNumElements(); + PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + if (!PtrTy && Ptr->getType()->isVectorTy()) + PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType()); + assert(PtrTy && "Unexpected type for Ptr argument"); + unsigned AddressSpace = PtrTy->getAddressSpace(); + + bool Scalarize = false; + if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy))) + Scalarize = true; + // Gather / Scatter for vector 2 is not profitable on KNL / SKX + // Vector-4 of gather/scatter instruction does not exist on KNL. + // We can extend it to 8 elements, but zeroing upper bits of + // the mask vector will add more instructions. Right now we give the scalar + // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is + // better in the VariableMask case. + if (VF == 2 || (VF == 4 && !ST->hasVLX())) + Scalarize = true; + + if (Scalarize) + return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); + + return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); +} + +bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { + Type *ScalarTy = DataTy->getScalarType(); + int DataWidth = isa<PointerType>(ScalarTy) ? + DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + return (DataWidth >= 32 && ST->hasAVX2()); +} + +bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { + return isLegalMaskedLoad(DataType); +} + +bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { + // This function is called now in two cases: from the Loop Vectorizer + // and from the Scalarizer. + // When the Loop Vectorizer asks about legality of the feature, + // the vectorization factor is not calculated yet. The Loop Vectorizer + // sends a scalar type and the decision is based on the width of the + // scalar element. + // Later on, the cost model will estimate usage this intrinsic based on + // the vector type. + // The Scalarizer asks again about legality. It sends a vector type. + // In this case we can reject non-power-of-2 vectors. + if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements())) + return false; + Type *ScalarTy = DataTy->getScalarType(); + int DataWidth = isa<PointerType>(ScalarTy) ? 
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + // AVX-512 allows gather and scatter + return DataWidth >= 32 && ST->hasAVX512(); +} + +bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { + return isLegalMaskedGather(DataType); +} + +bool X86TTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + + // Work this as a subsetting of subtarget features. + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // FIXME: This is likely too limiting as it will include subtarget features + // that we might not care about for inlining, but it is conservatively + // correct. + return (CallerBits & CalleeBits) == CalleeBits; +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h new file mode 100644 index 0000000..adb745e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -0,0 +1,109 @@ +//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file a TargetTransformInfo::Concept conforming object specific to the +/// X86 target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H + +#include "X86.h" +#include "X86TargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { + typedef BasicTTIImplBase<X86TTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const X86Subtarget *ST; + const X86TargetLowering *TLI; + + int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + + const X86Subtarget *getST() const { return ST; } + const X86TargetLowering *getTLI() const { return TLI; } + +public: + explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. 
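areInlineCompatible above reduces inline-compatibility to a subset test on subtarget feature bits: the callee may be inlined only if every feature it was compiled for is also available in the caller. A tiny sketch of that test, with std::bitset standing in for FeatureBitset (illustration only, not part of the patch):

#include <bitset>

static bool featuresAreSubsetSketch(const std::bitset<64> &CallerBits,
                                    const std::bitset<64> &CalleeBits) {
  // E.g. an AVX2 callee must not be inlined into an SSE2-only caller, or the
  // inlined body could execute instructions the caller's baseline lacks.
  return (CallerBits & CalleeBits) == CalleeBits;
}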
+ X86TTIImpl(const X86TTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + X86TTIImpl(X86TTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + + /// \name Scalar TTI Implementations + /// @{ + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + unsigned getMaxInterleaveFactor(unsigned VF); + int getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment); + int getAddressComputationCost(Type *PtrTy, bool IsComplex); + + int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + + int getIntImmCost(int64_t); + + int getIntImmCost(const APInt &Imm, Type *Ty); + + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); + bool isLegalMaskedLoad(Type *DataType); + bool isLegalMaskedStore(Type *DataType); + bool isLegalMaskedGather(Type *DataType); + bool isLegalMaskedScatter(Type *DataType); + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; +private: + int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, + unsigned Alignment, unsigned AddressSpace); + int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace); + + /// @} +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp new file mode 100644 index 0000000..6925b27 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -0,0 +1,320 @@ +//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which inserts x86 AVX vzeroupper instructions +// before calls to SSE encoded functions. This avoids transition latency +// penalty when transferring control between AVX encoded instructions and old +// SSE encoding mode. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-vzeroupper" + +STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); + +namespace { + + class VZeroUpperInserter : public MachineFunctionPass { + public: + + VZeroUpperInserter() : MachineFunctionPass(ID) {} + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override {return "X86 vzeroupper inserter";} + + private: + + void processBasicBlock(MachineBasicBlock &MBB); + void insertVZeroUpper(MachineBasicBlock::iterator I, + MachineBasicBlock &MBB); + void addDirtySuccessor(MachineBasicBlock &MBB); + + typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState; + static const char* getBlockExitStateName(BlockExitState ST); + + // Core algorithm state: + // BlockState - Each block is either: + // - PASS_THROUGH: There are neither YMM dirtying instructions nor + // vzeroupper instructions in this block. + // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this + // block that will ensure that YMM is clean on exit. + // - EXITS_DIRTY: An instruction in the block dirties YMM and no + // subsequent vzeroupper in the block clears it. + // + // AddedToDirtySuccessors - This flag is raised when a block is added to the + // DirtySuccessors list to ensure that it's not + // added multiple times. + // + // FirstUnguardedCall - Records the location of the first unguarded call in + // each basic block that may need to be guarded by a + // vzeroupper. We won't know whether it actually needs + // to be guarded until we discover a predecessor that + // is DIRTY_OUT. 
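// A small self-contained sketch (not LLVM code) of the propagation described
// above: successors of EXITS_DIRTY blocks go on a worklist; each visited block
// gets a vzeroupper before its first unguarded call (if it recorded one), and
// only PASS_THROUGH blocks keep forwarding dirtiness to their own successors.
// The three-block CFG and the instruction index below are invented.
#include <cstdio>
#include <vector>

enum ExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };

struct Block {
  ExitState Exit = PASS_THROUGH;
  int FirstUnguardedCall = -1; // -1 plays the role of MBB.end()
  bool Added = false;          // AddedToDirtySuccessors
  std::vector<int> Succs;
};

int main() {
  // Block 0 dirties YMM, block 1 is an empty pass-through block, and block 2
  // ends with a single, so far unguarded, call.
  std::vector<Block> B(3);
  B[0].Exit = EXITS_DIRTY;
  B[0].Succs = {1};
  B[1].Exit = PASS_THROUGH;
  B[1].Succs = {2};
  B[2].Exit = EXITS_CLEAN;
  B[2].FirstUnguardedCall = 0;

  std::vector<int> Work;
  auto AddDirtySuccessor = [&](int Id) {
    if (!B[Id].Added) {
      B[Id].Added = true;
      Work.push_back(Id);
    }
  };

  // Seed the worklist with successors of dirty blocks.
  for (unsigned Id = 0; Id < B.size(); ++Id)
    if (B[Id].Exit == EXITS_DIRTY)
      for (int S : B[Id].Succs)
        AddDirtySuccessor(S);

  while (!Work.empty()) {
    int Id = Work.back();
    Work.pop_back();
    if (B[Id].FirstUnguardedCall >= 0)
      std::printf("guard call %d in block %d with vzeroupper\n",
                  B[Id].FirstUnguardedCall, Id);
    if (B[Id].Exit == PASS_THROUGH) // dirtiness flows through empty blocks
      for (int S : B[Id].Succs)
        AddDirtySuccessor(S);
  }
  return 0;
}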
+ struct BlockState { + BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {} + BlockExitState ExitState; + bool AddedToDirtySuccessors; + MachineBasicBlock::iterator FirstUnguardedCall; + }; + typedef SmallVector<BlockState, 8> BlockStateMap; + typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList; + + BlockStateMap BlockStates; + DirtySuccessorsWorkList DirtySuccessors; + bool EverMadeChange; + const TargetInstrInfo *TII; + + static char ID; + }; + + char VZeroUpperInserter::ID = 0; +} + +FunctionPass *llvm::createX86IssueVZeroUpperPass() { + return new VZeroUpperInserter(); +} + +const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) { + switch (ST) { + case PASS_THROUGH: return "Pass-through"; + case EXITS_DIRTY: return "Exits-dirty"; + case EXITS_CLEAN: return "Exits-clean"; + } + llvm_unreachable("Invalid block exit state."); +} + +static bool isYmmReg(unsigned Reg) { + return (Reg >= X86::YMM0 && Reg <= X86::YMM15); +} + +static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { + for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), + E = MRI.livein_end(); I != E; ++I) + if (isYmmReg(I->first)) + return true; + + return false; +} + +static bool clobbersAllYmmRegs(const MachineOperand &MO) { + for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } + return true; +} + +static bool hasYmmReg(MachineInstr *MI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO)) + return true; + if (!MO.isReg()) + continue; + if (MO.isDebug()) + continue; + if (isYmmReg(MO.getReg())) + return true; + } + return false; +} + +/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this +/// instruction. +static bool callClobbersAnyYmmReg(MachineInstr *MI) { + assert(MI->isCall() && "Can only be called on call instructions."); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isRegMask()) + continue; + for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { + if (MO.clobbersPhysReg(reg)) + return true; + } + } + return false; +} + +// Insert a vzeroupper instruction before I. +void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I, + MachineBasicBlock &MBB) { + DebugLoc dl = I->getDebugLoc(); + BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER)); + ++NumVZU; + EverMadeChange = true; +} + +// Add MBB to the DirtySuccessors list if it hasn't already been added. +void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) { + if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) { + DirtySuccessors.push_back(&MBB); + BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true; + } +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// inserting vzeroupper instructions before function calls. +void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { + + // Start by assuming that the block PASS_THROUGH, which implies no unguarded + // calls. + BlockExitState CurState = PASS_THROUGH; + BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end(); + + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + MachineInstr *MI = I; + bool isControlFlow = MI->isCall() || MI->isReturn(); + + // Shortcut: don't need to check regular instructions in dirty state. 
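+    // That is, once CurState is EXITS_DIRTY another YMM use cannot change it,
+    // so only calls and returns (the points where a vzeroupper might have to
+    // be inserted) still need to be inspected.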
+ if (!isControlFlow && CurState == EXITS_DIRTY) + continue; + + if (hasYmmReg(MI)) { + // We found a ymm-using instruction; this could be an AVX instruction, + // or it could be control flow. + CurState = EXITS_DIRTY; + continue; + } + + // Check for control-flow out of the current function (which might + // indirectly execute SSE instructions). + if (!isControlFlow) + continue; + + // If the call won't clobber any YMM register, skip it as well. It usually + // happens on helper function calls (such as '_chkstk', '_ftol2') where + // standard calling convention is not used (RegMask is not used to mark + // register clobbered and register usage (def/imp-def/use) is well-defined + // and explicitly specified. + if (MI->isCall() && !callClobbersAnyYmmReg(MI)) + continue; + + // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX + // registers. This instruction has zero latency. In addition, the processor + // changes back to Clean state, after which execution of Intel SSE + // instructions or Intel AVX instructions has no transition penalty. Add + // the VZEROUPPER instruction before any function call/return that might + // execute SSE code. + // FIXME: In some cases, we may want to move the VZEROUPPER into a + // predecessor block. + if (CurState == EXITS_DIRTY) { + // After the inserted VZEROUPPER the state becomes clean again, but + // other YMM may appear before other subsequent calls or even before + // the end of the BB. + insertVZeroUpper(I, MBB); + CurState = EXITS_CLEAN; + } else if (CurState == PASS_THROUGH) { + // If this block is currently in pass-through state and we encounter a + // call then whether we need a vzeroupper or not depends on whether this + // block has successors that exit dirty. Record the location of the call, + // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet. + // It will be inserted later if necessary. + BlockStates[MBB.getNumber()].FirstUnguardedCall = I; + CurState = EXITS_CLEAN; + } + } + + DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: " + << getBlockExitStateName(CurState) << '\n'); + + if (CurState == EXITS_DIRTY) + for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), + SE = MBB.succ_end(); + SI != SE; ++SI) + addDirtySuccessor(**SI); + + BlockStates[MBB.getNumber()].ExitState = CurState; +} + +/// runOnMachineFunction - Loop over all of the basic blocks, inserting +/// vzeroupper instructions before function calls. +bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); + if (!ST.hasAVX() || ST.hasAVX512()) + return false; + TII = ST.getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + EverMadeChange = false; + + bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); + + // Fast check: if the function doesn't use any ymm registers, we don't need + // to insert any VZEROUPPER instructions. This is constant-time, so it is + // cheap in the common case of no ymm use. + bool YMMUsed = FnHasLiveInYmm; + if (!YMMUsed) { + const TargetRegisterClass *RC = &X86::VR256RegClass; + for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; + i++) { + if (!MRI.reg_nodbg_empty(*i)) { + YMMUsed = true; + break; + } + } + } + if (!YMMUsed) { + return false; + } + + assert(BlockStates.empty() && DirtySuccessors.empty() && + "X86VZeroUpper state should be clear"); + BlockStates.resize(MF.getNumBlockIDs()); + + // Process all blocks. 
This will compute block exit states, record the first
+  // unguarded call in each block, and add successors of dirty blocks to the
+  // DirtySuccessors list.
+  for (MachineBasicBlock &MBB : MF)
+    processBasicBlock(MBB);
+
+  // If any YMM regs are live in to this function, add the entry block to the
+  // DirtySuccessors list.
+  if (FnHasLiveInYmm)
+    addDirtySuccessor(MF.front());
+
+  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
+  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
+  // through PASS_THROUGH blocks.
+  while (!DirtySuccessors.empty()) {
+    MachineBasicBlock &MBB = *DirtySuccessors.back();
+    DirtySuccessors.pop_back();
+    BlockState &BBState = BlockStates[MBB.getNumber()];
+
+    // MBB is a successor of a dirty block, so its first call needs to be
+    // guarded.
+    if (BBState.FirstUnguardedCall != MBB.end())
+      insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
+
+    // If this successor was a pass-through block then it is now dirty, and its
+    // successors need to be added to the worklist (if they haven't been
+    // already).
+    if (BBState.ExitState == PASS_THROUGH) {
+      DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+                   << " was Pass-through, is now Dirty-out.\n");
+      for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+                                            SE = MBB.succ_end();
+           SI != SE; ++SI)
+        addDirtySuccessor(**SI);
+    }
+  }
+
+  BlockStates.clear();
+  return EverMadeChange;
+} diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp new file mode 100644 index 0000000..dce94a9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp @@ -0,0 +1,456 @@ +//===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// All functions using an MSVC EH personality use an explicitly updated state
+// number stored in an exception registration stack object. The registration
+// object is linked into a thread-local chain of registrations stored at fs:00.
+// This pass adds the registration object and EH state updates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "winehstate"
+
+namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); }
+
+namespace {
+class WinEHStatePass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid.
+ + WinEHStatePass() : FunctionPass(ID) { + initializeWinEHStatePassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &Fn) override; + + bool doInitialization(Module &M) override; + + bool doFinalization(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + const char *getPassName() const override { + return "Windows 32-bit x86 EH state insertion"; + } + +private: + void emitExceptionRegistrationRecord(Function *F); + + void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler); + void unlinkExceptionRegistration(IRBuilder<> &Builder); + void addStateStores(Function &F, WinEHFuncInfo &FuncInfo); + void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State); + + Value *emitEHLSDA(IRBuilder<> &Builder, Function *F); + + Function *generateLSDAInEAXThunk(Function *ParentFunc); + + // Module-level type getters. + Type *getEHLinkRegistrationType(); + Type *getSEHRegistrationType(); + Type *getCXXEHRegistrationType(); + + // Per-module data. + Module *TheModule = nullptr; + StructType *EHLinkRegistrationTy = nullptr; + StructType *CXXEHRegistrationTy = nullptr; + StructType *SEHRegistrationTy = nullptr; + Function *FrameRecover = nullptr; + Function *FrameAddress = nullptr; + Function *FrameEscape = nullptr; + + // Per-function state + EHPersonality Personality = EHPersonality::Unknown; + Function *PersonalityFn = nullptr; + + /// The stack allocation containing all EH data, including the link in the + /// fs:00 chain and the current state. + AllocaInst *RegNode = nullptr; + + /// Struct type of RegNode. Used for GEPing. + Type *RegNodeTy = nullptr; + + /// The index of the state field of RegNode. + int StateFieldIndex = ~0U; + + /// The linked list node subobject inside of RegNode. + Value *Link = nullptr; +}; +} + +FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); } + +char WinEHStatePass::ID = 0; + +INITIALIZE_PASS(WinEHStatePass, "x86-winehstate", + "Insert stores for EH state numbers", false, false) + +bool WinEHStatePass::doInitialization(Module &M) { + TheModule = &M; + FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape); + FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::localrecover); + FrameAddress = Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress); + return false; +} + +bool WinEHStatePass::doFinalization(Module &M) { + assert(TheModule == &M); + TheModule = nullptr; + EHLinkRegistrationTy = nullptr; + CXXEHRegistrationTy = nullptr; + SEHRegistrationTy = nullptr; + FrameEscape = nullptr; + FrameRecover = nullptr; + FrameAddress = nullptr; + return false; +} + +void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const { + // This pass should only insert a stack allocation, memory accesses, and + // localrecovers. + AU.setPreservesCFG(); +} + +bool WinEHStatePass::runOnFunction(Function &F) { + // Check the personality. Do nothing if this personality doesn't use funclets. + if (!F.hasPersonalityFn()) + return false; + PersonalityFn = + dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts()); + if (!PersonalityFn) + return false; + Personality = classifyEHPersonality(PersonalityFn); + if (!isFuncletEHPersonality(Personality)) + return false; + + // Skip this function if there are no EH pads and we aren't using IR-level + // outlining. 
+ bool HasPads = false; + for (BasicBlock &BB : F) { + if (BB.isEHPad()) { + HasPads = true; + break; + } + } + if (!HasPads) + return false; + + // Disable frame pointer elimination in this function. + // FIXME: Do the nested handlers need to keep the parent ebp in ebp, or can we + // use an arbitrary register? + F.addFnAttr("no-frame-pointer-elim", "true"); + + emitExceptionRegistrationRecord(&F); + + // The state numbers calculated here in IR must agree with what we calculate + // later on for the MachineFunction. In particular, if an IR pass deletes an + // unreachable EH pad after this point before machine CFG construction, we + // will be in trouble. If this assumption is ever broken, we should turn the + // numbers into an immutable analysis pass. + WinEHFuncInfo FuncInfo; + addStateStores(F, FuncInfo); + + // Reset per-function state. + PersonalityFn = nullptr; + Personality = EHPersonality::Unknown; + return true; +} + +/// Get the common EH registration subobject: +/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)( +/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *); +/// struct EHRegistrationNode { +/// EHRegistrationNode *Next; +/// PEXCEPTION_ROUTINE Handler; +/// }; +Type *WinEHStatePass::getEHLinkRegistrationType() { + if (EHLinkRegistrationTy) + return EHLinkRegistrationTy; + LLVMContext &Context = TheModule->getContext(); + EHLinkRegistrationTy = StructType::create(Context, "EHRegistrationNode"); + Type *FieldTys[] = { + EHLinkRegistrationTy->getPointerTo(0), // EHRegistrationNode *Next + Type::getInt8PtrTy(Context) // EXCEPTION_DISPOSITION (*Handler)(...) + }; + EHLinkRegistrationTy->setBody(FieldTys, false); + return EHLinkRegistrationTy; +} + +/// The __CxxFrameHandler3 registration node: +/// struct CXXExceptionRegistration { +/// void *SavedESP; +/// EHRegistrationNode SubRecord; +/// int32_t TryLevel; +/// }; +Type *WinEHStatePass::getCXXEHRegistrationType() { + if (CXXEHRegistrationTy) + return CXXEHRegistrationTy; + LLVMContext &Context = TheModule->getContext(); + Type *FieldTys[] = { + Type::getInt8PtrTy(Context), // void *SavedESP + getEHLinkRegistrationType(), // EHRegistrationNode SubRecord + Type::getInt32Ty(Context) // int32_t TryLevel + }; + CXXEHRegistrationTy = + StructType::create(FieldTys, "CXXExceptionRegistration"); + return CXXEHRegistrationTy; +} + +/// The _except_handler3/4 registration node: +/// struct EH4ExceptionRegistration { +/// void *SavedESP; +/// _EXCEPTION_POINTERS *ExceptionPointers; +/// EHRegistrationNode SubRecord; +/// int32_t EncodedScopeTable; +/// int32_t TryLevel; +/// }; +Type *WinEHStatePass::getSEHRegistrationType() { + if (SEHRegistrationTy) + return SEHRegistrationTy; + LLVMContext &Context = TheModule->getContext(); + Type *FieldTys[] = { + Type::getInt8PtrTy(Context), // void *SavedESP + Type::getInt8PtrTy(Context), // void *ExceptionPointers + getEHLinkRegistrationType(), // EHRegistrationNode SubRecord + Type::getInt32Ty(Context), // int32_t EncodedScopeTable + Type::getInt32Ty(Context) // int32_t TryLevel + }; + SEHRegistrationTy = StructType::create(FieldTys, "SEHExceptionRegistration"); + return SEHRegistrationTy; +} + +// Emit an exception registration record. These are stack allocations with the +// common subobject of two pointers: the previous registration record (the old +// fs:00) and the personality function for the current frame. The data before +// and after that is personality function specific. 
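// A rough source-level picture (a hand-written sketch, not pass output) of
// what emitExceptionRegistrationRecord() below arranges for the MSVC_CXX
// case. __readfsdword/__writefsdword are the MSVC x86 intrinsics for
// fs-relative access, and ehhandler_thunk is an invented stand-in for the
// __ehhandler$F thunk produced by generateLSDAInEAXThunk(). 32-bit MSVC only.
#include <intrin.h>

struct EHRegistrationNode {
  EHRegistrationNode *Next; // the previous record, i.e. the old [fs:00]
  void *Handler;            // PEXCEPTION_ROUTINE for this frame
};

struct CXXExceptionRegistration {
  void *SavedESP;
  EHRegistrationNode SubRecord; // the piece linked into the fs:00 chain
  int TryLevel;                 // the EH state number this pass keeps updated
};

extern "C" int ehhandler_thunk(); // placeholder for __ehhandler$F

void frame_with_eh_sketch() {
  CXXExceptionRegistration RegNode;            // the stack allocation
  RegNode.SavedESP = nullptr;                  // filled from llvm.stacksave()
  RegNode.TryLevel = -1;                       // no handler active yet
  RegNode.SubRecord.Handler = (void *)&ehhandler_thunk;
  RegNode.SubRecord.Next =
      (EHRegistrationNode *)__readfsdword(0);  // old head of the chain
  __writefsdword(0, (unsigned long)&RegNode.SubRecord); // link this frame in

  // ... function body: TryLevel is stored before potentially-throwing calls ...

  __writefsdword(0, (unsigned long)RegNode.SubRecord.Next); // unlink on return
}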
+void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { + assert(Personality == EHPersonality::MSVC_CXX || + Personality == EHPersonality::MSVC_X86SEH); + + StringRef PersonalityName = PersonalityFn->getName(); + IRBuilder<> Builder(&F->getEntryBlock(), F->getEntryBlock().begin()); + Type *Int8PtrType = Builder.getInt8PtrTy(); + if (Personality == EHPersonality::MSVC_CXX) { + RegNodeTy = getCXXEHRegistrationType(); + RegNode = Builder.CreateAlloca(RegNodeTy); + // SavedESP = llvm.stacksave() + Value *SP = Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {}); + Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); + // TryLevel = -1 + StateFieldIndex = 2; + insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), -1); + // Handler = __ehhandler$F + Function *Trampoline = generateLSDAInEAXThunk(F); + Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1); + linkExceptionRegistration(Builder, Trampoline); + } else if (Personality == EHPersonality::MSVC_X86SEH) { + // If _except_handler4 is in use, some additional guard checks and prologue + // stuff is required. + bool UseStackGuard = (PersonalityName == "_except_handler4"); + RegNodeTy = getSEHRegistrationType(); + RegNode = Builder.CreateAlloca(RegNodeTy); + // SavedESP = llvm.stacksave() + Value *SP = Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {}); + Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); + // TryLevel = -2 / -1 + StateFieldIndex = 4; + insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), + UseStackGuard ? -2 : -1); + // ScopeTable = llvm.x86.seh.lsda(F) + Value *FI8 = Builder.CreateBitCast(F, Int8PtrType); + Value *LSDA = Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8); + Type *Int32Ty = Type::getInt32Ty(TheModule->getContext()); + LSDA = Builder.CreatePtrToInt(LSDA, Int32Ty); + // If using _except_handler4, xor the address of the table with + // __security_cookie. + if (UseStackGuard) { + Value *Cookie = + TheModule->getOrInsertGlobal("__security_cookie", Int32Ty); + Value *Val = Builder.CreateLoad(Int32Ty, Cookie); + LSDA = Builder.CreateXor(LSDA, Val); + } + Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 3)); + Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 2); + linkExceptionRegistration(Builder, PersonalityFn); + } else { + llvm_unreachable("unexpected personality function"); + } + + // Insert an unlink before all returns. 
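+  // Conceptually this restores the previous head of the chain before each
+  // ret: [fs:00] = RegNode.SubRecord.Next (see unlinkExceptionRegistration).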
+ for (BasicBlock &BB : *F) { + TerminatorInst *T = BB.getTerminator(); + if (!isa<ReturnInst>(T)) + continue; + Builder.SetInsertPoint(T); + unlinkExceptionRegistration(Builder); + } +} + +Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) { + Value *FI8 = Builder.CreateBitCast(F, Type::getInt8PtrTy(F->getContext())); + return Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8); +} + +/// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls +/// PersonalityFn, forwarding the parameters passed to PEXCEPTION_ROUTINE: +/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)( +/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *); +/// We essentially want this code: +/// movl $lsda, %eax +/// jmpl ___CxxFrameHandler3 +Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { + LLVMContext &Context = ParentFunc->getContext(); + Type *Int32Ty = Type::getInt32Ty(Context); + Type *Int8PtrType = Type::getInt8PtrTy(Context); + Type *ArgTys[5] = {Int8PtrType, Int8PtrType, Int8PtrType, Int8PtrType, + Int8PtrType}; + FunctionType *TrampolineTy = + FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 4), + /*isVarArg=*/false); + FunctionType *TargetFuncTy = + FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5), + /*isVarArg=*/false); + Function *Trampoline = + Function::Create(TrampolineTy, GlobalValue::InternalLinkage, + Twine("__ehhandler$") + GlobalValue::getRealLinkageName( + ParentFunc->getName()), + TheModule); + BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); + IRBuilder<> Builder(EntryBB); + Value *LSDA = emitEHLSDA(Builder, ParentFunc); + Value *CastPersonality = + Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo()); + auto AI = Trampoline->arg_begin(); + Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++}; + CallInst *Call = Builder.CreateCall(CastPersonality, Args); + // Can't use musttail due to prototype mismatch, but we can use tail. + Call->setTailCall(true); + // Set inreg so we pass it in EAX. + Call->addAttribute(1, Attribute::InReg); + Builder.CreateRet(Call); + return Trampoline; +} + +void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder, + Function *Handler) { + // Emit the .safeseh directive for this function. + Handler->addFnAttr("safeseh"); + + Type *LinkTy = getEHLinkRegistrationType(); + // Handler = Handler + Value *HandlerI8 = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy()); + Builder.CreateStore(HandlerI8, Builder.CreateStructGEP(LinkTy, Link, 1)); + // Next = [fs:00] + Constant *FSZero = + Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257)); + Value *Next = Builder.CreateLoad(FSZero); + Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0)); + // [fs:00] = Link + Builder.CreateStore(Link, FSZero); +} + +void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) { + // Clone Link into the current BB for better address mode folding. + if (auto *GEP = dyn_cast<GetElementPtrInst>(Link)) { + GEP = cast<GetElementPtrInst>(GEP->clone()); + Builder.Insert(GEP); + Link = GEP; + } + Type *LinkTy = getEHLinkRegistrationType(); + // [fs:00] = Link->Next + Value *Next = + Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0)); + Constant *FSZero = + Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257)); + Builder.CreateStore(Next, FSZero); +} + +void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { + // Mark the registration node. 
The backend needs to know which alloca it is so + // that it can recover the original frame pointer. + IRBuilder<> Builder(RegNode->getParent(), std::next(RegNode->getIterator())); + Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy()); + Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode), + {RegNodeI8}); + + // Calculate state numbers. + if (isAsynchronousEHPersonality(Personality)) + calculateSEHStateNumbers(&F, FuncInfo); + else + calculateWinCXXEHStateNumbers(&F, FuncInfo); + + // Iterate all the instructions and emit state number stores. + DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F); + for (BasicBlock &BB : F) { + // Figure out what state we should assign calls in this block. + int BaseState = -1; + auto &BBColors = BlockColors[&BB]; + + assert(BBColors.size() == 1 && + "multi-color BB not removed by preparation"); + BasicBlock *FuncletEntryBB = BBColors.front(); + if (auto *FuncletPad = + dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) { + auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); + if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) + BaseState = BaseStateI->second; + } + + for (Instruction &I : BB) { + if (auto *CI = dyn_cast<CallInst>(&I)) { + // Possibly throwing call instructions have no actions to take after + // an unwind. Ensure they are in the -1 state. + if (CI->doesNotThrow()) + continue; + insertStateNumberStore(RegNode, CI, BaseState); + } else if (auto *II = dyn_cast<InvokeInst>(&I)) { + // Look up the state number of the landingpad this unwinds to. + assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!"); + int State = FuncInfo.InvokeStateMap[II]; + insertStateNumberStore(RegNode, II, State); + } + } + } +} + +void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode, + Instruction *IP, int State) { + IRBuilder<> Builder(IP); + Value *StateField = + Builder.CreateStructGEP(RegNodeTy, ParentRegNode, StateFieldIndex); + Builder.CreateStore(Builder.getInt32(State), StateField); +} |
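// A hand-written sketch of the net effect of addStateStores() for a simple
// __CxxFrameHandler3 function; the state numbers (-1 outside any try, 0 for
// the single try body) are what this shape would typically get, and the
// MayThrow* callees are invented. At the IR level the calls inside the try
// are invokes, whose state comes from FuncInfo.InvokeStateMap.
extern void MayThrowA();
extern void MayThrowB();

void example() {
  // RegNode.TryLevel = -1;   // set at function entry, no handler active
  try {
    // RegNode.TryLevel = 0;  // stored before each call covered by the try
    MayThrowA();
    // RegNode.TryLevel = 0;
    MayThrowB();
  } catch (...) {
  }
  // RegNode.TryLevel = -1;   // calls outside any try go back to state -1
  MayThrowA();
}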