Diffstat (limited to 'lib/Target')
217 files changed, 12883 insertions, 5263 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index bf4315f..6af5f85 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -51,6 +51,12 @@ def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", // to just not use them. def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", "Disable VFP / NEON MAC instructions">; + +// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. +def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", + "HasVMLxForwarding", "true", + "Has multiplier accumulator forwarding">; + // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", @@ -61,6 +67,14 @@ def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", "Prefer 32-bit Thumb instrs">; +/// Some instructions update CPSR partially, which can add false dependency for +/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is +/// mapped to a separate physical register. Avoid partial CPSR update for these +/// processors. +def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", + "AvoidCPSRPartialUpdate", "true", + "Avoid CPSR partial update for OOO execution">; + // Multiprocessing extension. def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", "Supports Multiprocessing extension">; @@ -100,11 +114,13 @@ def ProcOthers : SubtargetFeature<"others", "ARMProcFamily", "Others", def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8", "Cortex-A8 ARM processors", [FeatureSlowFPBrcc, FeatureNEONForFP, - FeatureHasSlowFPVMLx, FeatureT2XtPk]>; + FeatureHasSlowFPVMLx, FeatureVMLxForwarding, + FeatureT2XtPk]>; def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", "Cortex-A9 ARM processors", - [FeatureHasSlowFPVMLx, FeatureT2XtPk, - FeatureFP16]>; + [FeatureVMLxForwarding, + FeatureT2XtPk, FeatureFP16, + FeatureAvoidPartialCPSR]>; class ProcNoItin<string Name, list<SubtargetFeature> Features> : Processor<Name, GenericItineraries, Features>; @@ -171,6 +187,8 @@ def : Processor<"cortex-a8", CortexA8Itineraries, [ArchV7A, ProcA8]>; def : Processor<"cortex-a9", CortexA9Itineraries, [ArchV7A, ProcA9]>; +def : Processor<"cortex-a9-mp", CortexA9Itineraries, + [ArchV7A, ProcA9, FeatureMP]>; // V7M Processors. def : ProcNoItin<"cortex-m3", [ArchV7M]>; diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h index 19fbf05..595708f 100644 --- a/lib/Target/ARM/ARMAddressingModes.h +++ b/lib/Target/ARM/ARMAddressingModes.h @@ -408,16 +408,18 @@ namespace ARM_AM { // // The first operand is always a Reg. The second operand is a reg if in // reg/reg form, otherwise it's reg#0. The third field encodes the operation - // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. + // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The + // fourth operand 16-17 encodes the index mode. // // If this addressing mode is a frame index (before prolog/epilog insertion // and code rewriting), this operand will have the form: FI#, reg0, <offs> // with no shift amount for the frame offset. 
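The widened addrmode2 operand described above is a plain bit-packed word, so the round trip of the helpers changed in the hunk below can be checked in isolation. Here is a minimal self-contained sketch; the enum values are simplified stand-ins for LLVM's `ARM_AM::AddrOpc` and `ARM_AM::ShiftOpc`, and only the bit layout (immediate in bits 0-11, add/sub in bit 12, shift opcode in bits 13-15, index mode in bits 16-17) is taken from the patch:

```cpp
#include <cassert>
#include <cstdio>

// Simplified stand-ins for ARM_AM::AddrOpc / ARM_AM::ShiftOpc.
enum AddrOpc { add = 0, sub = 1 };
enum ShiftOpc { no_shift = 0, asr, lsl, lsr, ror, rrx };

// Pack: imm12 in bits 0-11, add/sub in bit 12, shift opcode in bits
// 13-15, index mode (none/pre/post/upd) in bits 16-17.
static unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
                          unsigned IdxMode = 0) {
  assert(Imm12 < (1u << 12) && "Imm too large!");
  bool isSub = Opc == sub;
  return Imm12 | ((unsigned)isSub << 12) | (SO << 13) | (IdxMode << 16);
}

static unsigned getAM2Offset(unsigned AM2Opc) { return AM2Opc & ((1 << 12) - 1); }
static AddrOpc getAM2Op(unsigned AM2Opc) { return ((AM2Opc >> 12) & 1) ? sub : add; }
// The "& 7" matters once bits 16-17 are in use: without the mask the
// index-mode bits would leak into the decoded shift opcode, which is
// exactly why the patch adds it to getAM2ShiftOpc.
static ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { return (ShiftOpc)((AM2Opc >> 13) & 7); }
static unsigned getAM2IdxMode(unsigned AM2Opc) { return AM2Opc >> 16; }

int main() {
  unsigned Opc = getAM2Opc(sub, 100, lsl, /*IdxMode=*/2); // post-indexed
  assert(getAM2Offset(Opc) == 100 && getAM2Op(Opc) == sub);
  assert(getAM2ShiftOpc(Opc) == lsl && getAM2IdxMode(Opc) == 2);
  std::printf("packed AM2 opc = 0x%x\n", Opc);
}
```

The index-mode values stored here line up with the `ARMII::IndexMode` enum (None/Pre/Post/Upd) that this same commit moves into ARMBaseInfo.h.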
// - static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) { + static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO, + unsigned IdxMode = 0) { assert(Imm12 < (1 << 12) && "Imm too large!"); bool isSub = Opc == sub; - return Imm12 | ((int)isSub << 12) | (SO << 13); + return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ; } static inline unsigned getAM2Offset(unsigned AM2Opc) { return AM2Opc & ((1 << 12)-1); @@ -426,7 +428,10 @@ namespace ARM_AM { return ((AM2Opc >> 12) & 1) ? sub : add; } static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { - return (ShiftOpc)(AM2Opc >> 13); + return (ShiftOpc)((AM2Opc >> 13) & 7); + } + static inline unsigned getAM2IdxMode(unsigned AM2Opc) { + return (AM2Opc >> 16); } @@ -441,12 +446,14 @@ namespace ARM_AM { // // The first operand is always a Reg. The second operand is a reg if in // reg/reg form, otherwise it's reg#0. The third field encodes the operation - // in bit 8, the immediate in bits 0-7. + // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the + // index mode. /// getAM3Opc - This function encodes the addrmode3 opc field. - static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) { + static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset, + unsigned IdxMode = 0) { bool isSub = Opc == sub; - return ((int)isSub << 8) | Offset; + return ((int)isSub << 8) | Offset | (IdxMode << 9); } static inline unsigned char getAM3Offset(unsigned AM3Opc) { return AM3Opc & 0xFF; @@ -454,6 +461,9 @@ namespace ARM_AM { static inline AddrOpc getAM3Op(unsigned AM3Opc) { return ((AM3Opc >> 8) & 1) ? sub : add; } + static inline unsigned getAM3IdxMode(unsigned AM3Opc) { + return (AM3Opc >> 9); + } //===--------------------------------------------------------------------===// // Addressing Mode #4 diff --git a/lib/Target/ARM/ARMAsmBackend.cpp b/lib/Target/ARM/ARMAsmBackend.cpp index ec23449..f062819 100644 --- a/lib/Target/ARM/ARMAsmBackend.cpp +++ b/lib/Target/ARM/ARMAsmBackend.cpp @@ -246,7 +246,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { } uint32_t out = (opc << 21); - out |= (Value & 0x800) << 14; + out |= (Value & 0x800) << 15; out |= (Value & 0x700) << 4; out |= (Value & 0x0FF); @@ -416,21 +416,22 @@ void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, // FIXME: This should be in a separate file. class DarwinARMAsmBackend : public ARMAsmBackend { public: - DarwinARMAsmBackend(const Target &T) : ARMAsmBackend(T) { } - - void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value) const; + const object::mach::CPUSubtypeARM Subtype; + DarwinARMAsmBackend(const Target &T, object::mach::CPUSubtypeARM st) + : ARMAsmBackend(T), Subtype(st) { } MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - // FIXME: Subtarget info should be derived. Force v7 for now. 
return createMachObjectWriter(new ARMMachObjectWriter( /*Is64Bit=*/false, object::mach::CTM_ARM, - object::mach::CSARM_V7), + Subtype), OS, /*IsLittleEndian=*/true); } + void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const; + virtual bool doesSectionRequireSymbols(const MCSection &Section) const { return false; } @@ -499,14 +500,17 @@ void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, TargetAsmBackend *llvm::createARMAsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: - return new DarwinARMAsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - assert(0 && "Windows not supported on ARM"); - default: - return new ELFARMAsmBackend(T, Triple(TT).getOS()); + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) { + if (TheTriple.getArchName() == "armv6" || + TheTriple.getArchName() == "thumbv6") + return new DarwinARMAsmBackend(T, object::mach::CSARM_V6); + return new DarwinARMAsmBackend(T, object::mach::CSARM_V7); } + + if (TheTriple.isOSWindows()) + assert(0 && "Windows not supported on ARM"); + + return new ELFARMAsmBackend(T, Triple(TT).getOS()); } diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index db12b8e..c428e18 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -88,6 +88,11 @@ namespace { case ARMBuildAttrs::CPU_name: Streamer.EmitRawText(StringRef("\t.cpu ") + LowercaseString(String)); break; + /* GAS requires .fpu to be emitted regardless of EABI attribute */ + case ARMBuildAttrs::Advanced_SIMD_arch: + case ARMBuildAttrs::VFP_arch: + Streamer.EmitRawText(StringRef("\t.fpu ") + LowercaseString(String)); + break; default: assert(0 && "Unsupported Text attribute in ASM Mode"); break; } } @@ -167,6 +172,117 @@ getDebugValueLocation(const MachineInstr *MI) const { return Location; } +/// getDwarfRegOpSize - get size required to emit given machine location using +/// dwarf encoding. +unsigned ARMAsmPrinter::getDwarfRegOpSize(const MachineLocation &MLoc) const { + const TargetRegisterInfo *RI = TM.getRegisterInfo(); + if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) + return AsmPrinter::getDwarfRegOpSize(MLoc); + else { + unsigned Reg = MLoc.getReg(); + if (Reg >= ARM::S0 && Reg <= ARM::S31) { + assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering"); + // S registers are described as bit-pieces of a register + // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0) + // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32) + + unsigned SReg = Reg - ARM::S0; + unsigned Rx = 256 + (SReg >> 1); + OutStreamer.AddComment("Loc expr size"); + // DW_OP_regx + ULEB + DW_OP_bit_piece + ULEB + ULEB + // 1 + ULEB(Rx) + 1 + 1 + 1 + return 4 + MCAsmInfo::getULEB128Size(Rx); + } + + if (Reg >= ARM::Q0 && Reg <= ARM::Q15) { + assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering"); + // Q registers Q0-Q15 are described by composing two D registers together. + // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) DW_OP_piece(8) + + unsigned QReg = Reg - ARM::Q0; + unsigned D1 = 256 + 2 * QReg; + unsigned D2 = D1 + 1; + + OutStreamer.AddComment("Loc expr size"); + // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8) + + // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8); + // 6 + ULEB(D1) + ULEB(D2) + return 6 + MCAsmInfo::getULEB128Size(D1) + MCAsmInfo::getULEB128Size(D2); + } + } + return 0; +} + +/// EmitDwarfRegOp - Emit dwarf register operation. 
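Before the matching emission code below, the size computation in getDwarfRegOpSize above can be sanity-checked by hand, since it is pure arithmetic over ULEB128 lengths. A hedged sketch follows; the ULEB128 size helper is re-implemented locally (in-tree it is `MCAsmInfo::getULEB128Size`), and the S5/Q8 picks are arbitrary worked examples:

```cpp
#include <cassert>
#include <cstdio>

// Number of bytes a value occupies in unsigned LEB128 encoding
// (7 payload bits per byte).
static unsigned getULEB128Size(unsigned Value) {
  unsigned Size = 0;
  do { Value >>= 7; ++Size; } while (Value);
  return Size;
}

int main() {
  // S5 lives in the high half of D2: Rx = 256 + (5 >> 1) = 258.
  // Expression: DW_OP_regx ULEB(258) DW_OP_bit_piece ULEB(32) ULEB(32)
  //           = 1 + 2 + 1 + 1 + 1 = 6 bytes, i.e. 4 + ULEB128Size(Rx).
  unsigned Rx = 256 + (5 >> 1);
  assert(4 + getULEB128Size(Rx) == 6);

  // Q8 is the D16/D17 pair: D1 = 256 + 2*8 = 272, D2 = 273.
  // Expression: DW_OP_regx ULEB(272) DW_OP_piece ULEB(8), then the same
  // again for D2 = 6 + ULEB128Size(D1) + ULEB128Size(D2) = 10 bytes.
  unsigned D1 = 256 + 2 * 8, D2 = D1 + 1;
  std::printf("Q8 loc expr size = %u\n",
              6 + getULEB128Size(D1) + getULEB128Size(D2));
}
```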
+void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { + const TargetRegisterInfo *RI = TM.getRegisterInfo(); + if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) + AsmPrinter::EmitDwarfRegOp(MLoc); + else { + unsigned Reg = MLoc.getReg(); + if (Reg >= ARM::S0 && Reg <= ARM::S31) { + assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering"); + // S registers are described as bit-pieces of a register + // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0) + // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32) + + unsigned SReg = Reg - ARM::S0; + bool odd = SReg & 0x1; + unsigned Rx = 256 + (SReg >> 1); + OutStreamer.AddComment("Loc expr size"); + // DW_OP_regx + ULEB + DW_OP_bit_piece + ULEB + ULEB + // 1 + ULEB(Rx) + 1 + 1 + 1 + EmitInt16(4 + MCAsmInfo::getULEB128Size(Rx)); + + OutStreamer.AddComment("DW_OP_regx for S register"); + EmitInt8(dwarf::DW_OP_regx); + + OutStreamer.AddComment(Twine(SReg)); + EmitULEB128(Rx); + + if (odd) { + OutStreamer.AddComment("DW_OP_bit_piece 32 32"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(32); + } else { + OutStreamer.AddComment("DW_OP_bit_piece 32 0"); + EmitInt8(dwarf::DW_OP_bit_piece); + EmitULEB128(32); + EmitULEB128(0); + } + } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) { + assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering"); + // Q registers Q0-Q15 are described by composing two D registers together. + // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) DW_OP_piece(8) + + unsigned QReg = Reg - ARM::Q0; + unsigned D1 = 256 + 2 * QReg; + unsigned D2 = D1 + 1; + + OutStreamer.AddComment("Loc expr size"); + // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8) + + // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8); + // 6 + ULEB(D1) + ULEB(D2) + EmitInt16(6 + MCAsmInfo::getULEB128Size(D1) + MCAsmInfo::getULEB128Size(D2)); + + OutStreamer.AddComment("DW_OP_regx for Q register: D1"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D1); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); + + OutStreamer.AddComment("DW_OP_regx for Q register: D2"); + EmitInt8(dwarf::DW_OP_regx); + EmitULEB128(D2); + OutStreamer.AddComment("DW_OP_piece 8"); + EmitInt8(dwarf::DW_OP_piece); + EmitULEB128(8); + } + } +} + void ARMAsmPrinter::EmitFunctionEntryLabel() { if (AFI->isThumbFunction()) { OutStreamer.EmitAssemblerFlag(MCAF_Code16); @@ -453,10 +569,13 @@ void ARMAsmPrinter::emitAttributes() { emitARMAttributeSection(); + /* GAS expect .fpu to be emitted, regardless of VFP build attribute */ + bool emitFPU = false; AttributeEmitter *AttrEmitter; - if (OutStreamer.hasRawTextSupport()) + if (OutStreamer.hasRawTextSupport()) { AttrEmitter = new AsmAttributeEmitter(OutStreamer); - else { + emitFPU = true; + } else { MCObjectStreamer &O = static_cast<MCObjectStreamer&>(OutStreamer); AttrEmitter = new ObjectAttributeEmitter(O); } @@ -490,10 +609,36 @@ void ARMAsmPrinter::emitAttributes() { ARMBuildAttrs::Allowed); } - // FIXME: Emit FPU type - if (Subtarget->hasVFP2()) + if (Subtarget->hasNEON() && emitFPU) { + /* NEON is not exactly a VFP architecture, but GAS emit one of + * neon/vfpv3/vfpv2 for .fpu parameters */ + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon"); + /* If emitted for NEON, omit from VFP below, since you can have both + * NEON and VFP in build attributes but only one .fpu */ + emitFPU = false; + } + + /* VFPv3 + .fpu */ + if (Subtarget->hasVFP3()) { + 
AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, + ARMBuildAttrs::AllowFPv3A); + if (emitFPU) + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv3"); + + /* VFPv2 + .fpu */ + } else if (Subtarget->hasVFP2()) { AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch, ARMBuildAttrs::AllowFPv2); + if (emitFPU) + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv2"); + } + + /* TODO: ARMBuildAttrs::Allowed is not completely accurate, + * since NEON can have 1 (allowed) or 2 (fused MAC operations) */ + if (Subtarget->hasNEON()) { + AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, + ARMBuildAttrs::Allowed); + } // Signal various FP modes. if (!UnsafeFPMath) { @@ -777,10 +922,161 @@ void ARMAsmPrinter::EmitPatchedInstruction(const MachineInstr *MI, OutStreamer.EmitInstruction(TmpInst); } +void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { + assert(MI->getFlag(MachineInstr::FrameSetup) && + "Only instruction which are involved into frame setup code are allowed"); + + const MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>(); + + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned Opc = MI->getOpcode(); + unsigned SrcReg, DstReg; + + if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) { + // Two special cases: + // 1) tPUSH does not have src/dst regs. + // 2) for Thumb1 code we sometimes materialize the constant via constpool + // load. Yes, this is pretty fragile, but for now I don't see better + // way... :( + SrcReg = DstReg = ARM::SP; + } else { + SrcReg = MI->getOperand(1).getReg(); + DstReg = MI->getOperand(0).getReg(); + } + + // Try to figure out the unwinding opcode out of src / dst regs. + if (MI->getDesc().mayStore()) { + // Register saves. + assert(DstReg == ARM::SP && + "Only stack pointer as a destination reg is supported"); + + SmallVector<unsigned, 4> RegList; + // Skip src & dst reg, and pred ops. + unsigned StartOp = 2 + 2; + // Use all the operands. + unsigned NumOffset = 0; + + switch (Opc) { + default: + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + case ARM::tPUSH: + // Special case here: no src & dst reg, but two extra imp ops. + StartOp = 2; NumOffset = 2; + case ARM::STMDB_UPD: + case ARM::t2STMDB_UPD: + case ARM::VSTMDDB_UPD: + assert(SrcReg == ARM::SP && + "Only stack pointer as a source reg is supported"); + for (unsigned i = StartOp, NumOps = MI->getNumOperands() - NumOffset; + i != NumOps; ++i) + RegList.push_back(MI->getOperand(i).getReg()); + break; + case ARM::STR_PRE: + assert(MI->getOperand(2).getReg() == ARM::SP && + "Only stack pointer as a source reg is supported"); + RegList.push_back(SrcReg); + break; + } + OutStreamer.EmitRegSave(RegList, Opc == ARM::VSTMDDB_UPD); + } else { + // Changes of stack / frame pointer. 
+ if (SrcReg == ARM::SP) { + int64_t Offset = 0; + switch (Opc) { + default: + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + case ARM::MOVr: + case ARM::tMOVgpr2gpr: + case ARM::tMOVgpr2tgpr: + Offset = 0; + break; + case ARM::ADDri: + Offset = -MI->getOperand(2).getImm(); + break; + case ARM::SUBri: + case ARM::t2SUBrSPi: + Offset = MI->getOperand(2).getImm(); + break; + case ARM::tSUBspi: + Offset = MI->getOperand(2).getImm()*4; + break; + case ARM::tADDspi: + case ARM::tADDrSPi: + Offset = -MI->getOperand(2).getImm()*4; + break; + case ARM::tLDRpci: { + // Grab the constpool index and check, whether it corresponds to + // original or cloned constpool entry. + unsigned CPI = MI->getOperand(1).getIndex(); + const MachineConstantPool *MCP = MF.getConstantPool(); + if (CPI >= MCP->getConstants().size()) + CPI = AFI.getOriginalCPIdx(CPI); + assert(CPI != -1U && "Invalid constpool index"); + + // Derive the actual offset. + const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI]; + assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry"); + // FIXME: Check for user, it should be "add" instruction! + Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue(); + break; + } + } + + if (DstReg == FramePtr && FramePtr != ARM::SP) + // Set-up of the frame pointer. Positive values correspond to "add" + // instruction. + OutStreamer.EmitSetFP(FramePtr, ARM::SP, -Offset); + else if (DstReg == ARM::SP) { + // Change of SP by an offset. Positive values correspond to "sub" + // instruction. + OutStreamer.EmitPad(Offset); + } else { + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + } + } else if (DstReg == ARM::SP) { + // FIXME: .movsp goes here + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + } + else { + MI->dump(); + assert(0 && "Unsupported opcode for unwinding information"); + } + } +} + +extern cl::opt<bool> EnableARMEHABI; + void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { unsigned Opc = MI->getOpcode(); switch (Opc) { default: break; + case ARM::B: { + // B is just a Bcc with an 'always' predicate. + MCInst TmpInst; + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + TmpInst.setOpcode(ARM::Bcc); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::LDMIA_RET: { + // LDMIA_RET is just a normal LDMIA_UPD instruction that targets PC and as + // such has additional code-gen properties and scheduling information. + // To emit it, we just construct as normal and set the opcode to LDMIA_UPD. + MCInst TmpInst; + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + TmpInst.setOpcode(ARM::LDMIA_UPD); + OutStreamer.EmitInstruction(TmpInst); + return; + } case ARM::t2ADDrSPi: case ARM::t2ADDrSPi12: case ARM::t2SUBrSPi: @@ -850,6 +1146,26 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutStreamer.EmitInstruction(TmpInst); return; } + // Darwin call instructions are just normal call instructions with different + // clobber semantics (they clobber R9). 
+ case ARM::BLr9: + case ARM::BLr9_pred: + case ARM::BLXr9: + case ARM::BLXr9_pred: { + unsigned newOpc; + switch (Opc) { + default: assert(0); + case ARM::BLr9: newOpc = ARM::BL; break; + case ARM::BLr9_pred: newOpc = ARM::BL_pred; break; + case ARM::BLXr9: newOpc = ARM::BLX; break; + case ARM::BLXr9_pred: newOpc = ARM::BLX_pred; break; + } + MCInst TmpInst; + LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + TmpInst.setOpcode(newOpc); + OutStreamer.EmitInstruction(TmpInst); + return; + } case ARM::BXr9_CALL: case ARM::BX_CALL: { { @@ -1502,6 +1818,49 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; } + // Tail jump branches are really just branch instructions with additional + // code-gen attributes. Convert them to the canonical form here. + case ARM::TAILJMPd: + case ARM::TAILJMPdND: { + MCInst TmpInst, TmpInst2; + // Lower the instruction as-is to get the operands properly converted. + LowerARMMachineInstrToMCInst(MI, TmpInst2, *this); + TmpInst.setOpcode(ARM::Bcc); + TmpInst.addOperand(TmpInst2.getOperand(0)); + // Add predicate operands. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("TAILCALL"); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::tTAILJMPd: + case ARM::tTAILJMPdND: { + MCInst TmpInst, TmpInst2; + LowerARMMachineInstrToMCInst(MI, TmpInst2, *this); + TmpInst.setOpcode(ARM::tB); + TmpInst.addOperand(TmpInst2.getOperand(0)); + OutStreamer.AddComment("TAILCALL"); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::TAILJMPrND: + case ARM::tTAILJMPrND: + case ARM::TAILJMPr: + case ARM::tTAILJMPr: { + unsigned newOpc = (Opc == ARM::TAILJMPr || Opc == ARM::TAILJMPrND) + ? ARM::BX : ARM::tBX; + MCInst TmpInst; + TmpInst.setOpcode(newOpc); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + // Predicate. + TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); + TmpInst.addOperand(MCOperand::CreateReg(0)); + OutStreamer.AddComment("TAILCALL"); + OutStreamer.EmitInstruction(TmpInst); + return; + } + // These are the pseudos created to comply with stricter operand restrictions // on ARMv5. Lower them now to "normal" instructions, since all the // restrictions are already satisfied. @@ -1530,6 +1889,11 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; LowerARMMachineInstrToMCInst(MI, TmpInst, *this); + + // Emit unwinding stuff for frame-related instructions + if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup)) + EmitUnwindingInstruction(MI); + OutStreamer.EmitInstruction(TmpInst); } @@ -1538,10 +1902,11 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { //===----------------------------------------------------------------------===// static MCInstPrinter *createARMMCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { if (SyntaxVariant == 0) - return new ARMInstPrinter(MAI); + return new ARMInstPrinter(TM, MAI); return 0; } diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index 5852684..1ee1b70 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -82,11 +82,20 @@ private: // Generic helper used to emit e.g. 
ARMv5 mul pseudos void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc); + void EmitUnwindingInstruction(const MachineInstr *MI); + public: void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + /// getDwarfRegOpSize - get size required to emit given machine location + /// using dwarf encoding. + virtual unsigned getDwarfRegOpSize(const MachineLocation &MLoc) const; + + /// EmitDwarfRegOp - Emit dwarf register operation. + virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const; + virtual unsigned getISAEncoding() { // ARM/Darwin adds ISA to the DWARF info for each function. if (!Subtarget->isTargetDarwin()) diff --git a/lib/Target/ARM/ARMBaseInfo.h b/lib/Target/ARM/ARMBaseInfo.h index a56cc1a..36edbad 100644 --- a/lib/Target/ARM/ARMBaseInfo.h +++ b/lib/Target/ARM/ARMBaseInfo.h @@ -200,6 +200,59 @@ inline static unsigned getARMRegisterNumbering(unsigned Reg) { } namespace ARMII { + + /// ARM Index Modes + enum IndexMode { + IndexModeNone = 0, + IndexModePre = 1, + IndexModePost = 2, + IndexModeUpd = 3 + }; + + /// ARM Addressing Modes + enum AddrMode { + AddrModeNone = 0, + AddrMode1 = 1, + AddrMode2 = 2, + AddrMode3 = 3, + AddrMode4 = 4, + AddrMode5 = 5, + AddrMode6 = 6, + AddrModeT1_1 = 7, + AddrModeT1_2 = 8, + AddrModeT1_4 = 9, + AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data + AddrModeT2_i12 = 11, + AddrModeT2_i8 = 12, + AddrModeT2_so = 13, + AddrModeT2_pc = 14, // +/- i12 for pc relative data + AddrModeT2_i8s4 = 15, // i8 * 4 + AddrMode_i12 = 16 + }; + + inline static const char *AddrModeToString(AddrMode addrmode) { + switch (addrmode) { + default: llvm_unreachable("Unknown memory operation"); + case AddrModeNone: return "AddrModeNone"; + case AddrMode1: return "AddrMode1"; + case AddrMode2: return "AddrMode2"; + case AddrMode3: return "AddrMode3"; + case AddrMode4: return "AddrMode4"; + case AddrMode5: return "AddrMode5"; + case AddrMode6: return "AddrMode6"; + case AddrModeT1_1: return "AddrModeT1_1"; + case AddrModeT1_2: return "AddrModeT1_2"; + case AddrModeT1_4: return "AddrModeT1_4"; + case AddrModeT1_s: return "AddrModeT1_s"; + case AddrModeT2_i12: return "AddrModeT2_i12"; + case AddrModeT2_i8: return "AddrModeT2_i8"; + case AddrModeT2_so: return "AddrModeT2_so"; + case AddrModeT2_pc: return "AddrModeT2_pc"; + case AddrModeT2_i8s4: return "AddrModeT2_i8s4"; + case AddrMode_i12: return "AddrMode_i12"; + } + } + /// Target Operand Flag enum. 
enum TOF { //===------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 2268e59..44a3976 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1021,7 +1021,7 @@ reMaterialize(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode), DestReg) .addConstantPoolIndex(CPI).addImm(PCLabelId); - (*MIB).setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); + MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end()); break; } } @@ -1080,11 +1080,18 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, int CPI1 = MO1.getIndex(); const MachineConstantPoolEntry &MCPE0 = MCP->getConstants()[CPI0]; const MachineConstantPoolEntry &MCPE1 = MCP->getConstants()[CPI1]; - ARMConstantPoolValue *ACPV0 = - static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal); - ARMConstantPoolValue *ACPV1 = - static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal); - return ACPV0->hasSameValue(ACPV1); + bool isARMCP0 = MCPE0.isMachineConstantPoolEntry(); + bool isARMCP1 = MCPE1.isMachineConstantPoolEntry(); + if (isARMCP0 && isARMCP1) { + ARMConstantPoolValue *ACPV0 = + static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal); + ARMConstantPoolValue *ACPV1 = + static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal); + return ACPV0->hasSameValue(ACPV1); + } else if (!isARMCP0 && !isARMCP1) { + return MCPE0.Val.ConstVal == MCPE1.Val.ConstVal; + } + return false; } else if (Opcode == ARM::PICLDR) { if (MI1->getOpcode() != Opcode) return false; @@ -1194,7 +1201,7 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, } /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to -/// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should +/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled togther. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. This function takes two integers that represent the load offsets @@ -1263,19 +1270,19 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, } bool ARMBaseInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, unsigned ExtraPredCycles, float Probability, float Confidence) const { - if (!NumCyles) + if (!NumCycles) return false; // Attempt to estimate the relative costs of predication versus branching. - float UnpredCost = Probability * NumCyles; + float UnpredCost = Probability * NumCycles; UnpredCost += 1.0; // The branch itself UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty(); - return (float)(NumCyles + ExtraPredCycles) < UnpredCost; + return (float)(NumCycles + ExtraPredCycles) < UnpredCost; } bool ARMBaseInstrInfo:: @@ -1328,7 +1335,7 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, - const ARMBaseInstrInfo &TII) { + const ARMBaseInstrInfo &TII, unsigned MIFlags) { bool isSub = NumBytes < 0; if (isSub) NumBytes = -NumBytes; @@ -1346,7 +1353,8 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, unsigned Opc = isSub ? 
ARM::SUBri : ARM::ADDri; BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) .addReg(BaseReg, RegState::Kill).addImm(ThisVal) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); BaseReg = DestReg; } } @@ -1610,18 +1618,84 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask, // Set the "zero" bit in CPSR. switch (MI->getOpcode()) { default: break; + case ARM::RSBrr: + case ARM::RSBri: + case ARM::RSCrr: + case ARM::RSCri: + case ARM::ADDrr: case ARM::ADDri: - case ARM::ANDri: - case ARM::t2ANDri: + case ARM::ADCrr: + case ARM::ADCri: + case ARM::SUBrr: case ARM::SUBri: + case ARM::SBCrr: + case ARM::SBCri: + case ARM::t2RSBri: + case ARM::t2ADDrr: case ARM::t2ADDri: + case ARM::t2ADCrr: + case ARM::t2ADCri: + case ARM::t2SUBrr: case ARM::t2SUBri: + case ARM::t2SBCrr: + case ARM::t2SBCri: + case ARM::ANDrr: + case ARM::ANDri: + case ARM::t2ANDrr: + case ARM::t2ANDri: + case ARM::ORRrr: + case ARM::ORRri: + case ARM::t2ORRrr: + case ARM::t2ORRri: + case ARM::EORrr: + case ARM::EORri: + case ARM::t2EORrr: + case ARM::t2EORri: { + // Scan forward for the use of CPSR, if it's a conditional code requires + // checking of V bit, then this is not safe to do. If we can't find the + // CPSR use (i.e. used in another block), then it's not safe to perform + // the optimization. + bool isSafe = false; + I = CmpInstr; + E = MI->getParent()->end(); + while (!isSafe && ++I != E) { + const MachineInstr &Instr = *I; + for (unsigned IO = 0, EO = Instr.getNumOperands(); + !isSafe && IO != EO; ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (!MO.isReg() || MO.getReg() != ARM::CPSR) + continue; + if (MO.isDef()) { + isSafe = true; + break; + } + // Condition code is after the operand before CPSR. + ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm(); + switch (CC) { + default: + isSafe = true; + break; + case ARMCC::VS: + case ARMCC::VC: + case ARMCC::GE: + case ARMCC::LT: + case ARMCC::GT: + case ARMCC::LE: + return false; + } + } + } + + if (!isSafe) + return false; + // Toggle the optional operand to CPSR. MI->getOperand(5).setReg(ARM::CPSR); MI->getOperand(5).setIsDef(true); CmpInstr->eraseFromParent(); return true; } + } return false; } @@ -1741,9 +1815,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, llvm_unreachable("Unexpected multi-uops instruction!"); break; case ARM::VLDMQIA: - case ARM::VLDMQDB: case ARM::VSTMQIA: - case ARM::VSTMQDB: return 2; // The number of uOps for load / store multiple are determined by the number @@ -1757,19 +1829,15 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, // is not 64-bit aligned, then AGU would take an extra cycle. For VFP / NEON // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1. 
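The uop formula quoted in that comment, (#reg / 2) + (#reg % 2) + 1, is easy to tabulate before reading the long opcode list that follows. My reading is one uop per 64-bit register pair, one for a trailing odd register, plus one more; the patch itself does not spell out that interpretation. A one-line sketch:

```cpp
#include <cstdio>

// uop count for VFP/NEON load/store multiple, per the comment above:
// (#reg / 2) + (#reg % 2) + 1.
static unsigned vldmMicroOps(unsigned NumRegs) {
  return NumRegs / 2 + NumRegs % 2 + 1;
}

int main() {
  for (unsigned n = 1; n <= 6; ++n)
    std::printf("%u regs -> %u uops\n", n, vldmMicroOps(n));
  // Prints: 1 -> 2, 2 -> 2, 3 -> 3, 4 -> 3, 5 -> 4, 6 -> 4
}
```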
case ARM::VLDMDIA: - case ARM::VLDMDDB: case ARM::VLDMDIA_UPD: case ARM::VLDMDDB_UPD: case ARM::VLDMSIA: - case ARM::VLDMSDB: case ARM::VLDMSIA_UPD: case ARM::VLDMSDB_UPD: case ARM::VSTMDIA: - case ARM::VSTMDDB: case ARM::VSTMDIA_UPD: case ARM::VSTMDDB_UPD: case ARM::VSTMSIA: - case ARM::VSTMSDB: case ARM::VSTMSIA_UPD: case ARM::VSTMSDB_UPD: { unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands(); @@ -1859,7 +1927,6 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData, switch (DefTID.getOpcode()) { default: break; case ARM::VLDMSIA: - case ARM::VLDMSDB: case ARM::VLDMSIA_UPD: case ARM::VLDMSDB_UPD: isSLoad = true; @@ -1935,7 +2002,6 @@ ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData, switch (UseTID.getOpcode()) { default: break; case ARM::VSTMSIA: - case ARM::VSTMSDB: case ARM::VSTMSIA_UPD: case ARM::VSTMSDB_UPD: isSStore = true; @@ -2006,11 +2072,9 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, break; case ARM::VLDMDIA: - case ARM::VLDMDDB: case ARM::VLDMDIA_UPD: case ARM::VLDMDDB_UPD: case ARM::VLDMSIA: - case ARM::VLDMSDB: case ARM::VLDMSIA_UPD: case ARM::VLDMSDB_UPD: DefCycle = getVLDMDefCycle(ItinData, DefTID, DefClass, DefIdx, DefAlign); @@ -2049,11 +2113,9 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, break; case ARM::VSTMDIA: - case ARM::VSTMDDB: case ARM::VSTMDIA_UPD: case ARM::VSTMDDB_UPD: case ARM::VSTMSIA: - case ARM::VSTMSDB: case ARM::VSTMSIA_UPD: case ARM::VSTMSDB_UPD: UseCycle = getVSTMUseCycle(ItinData, UseTID, UseClass, UseIdx, UseAlign); @@ -2160,6 +2222,101 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } } + if (DefAlign < 8 && Subtarget.isCortexA9()) + switch (DefTID.getOpcode()) { + default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: + case ARM::VLD1q8_UPD: + case ARM::VLD1q16_UPD: + case ARM::VLD1q32_UPD: + case ARM::VLD1q64_UPD: + case ARM::VLD2d8: + case ARM::VLD2d16: + case ARM::VLD2d32: + case ARM::VLD2q8: + case ARM::VLD2q16: + case ARM::VLD2q32: + case ARM::VLD2d8_UPD: + case ARM::VLD2d16_UPD: + case ARM::VLD2d32_UPD: + case ARM::VLD2q8_UPD: + case ARM::VLD2q16_UPD: + case ARM::VLD2q32_UPD: + case ARM::VLD3d8: + case ARM::VLD3d16: + case ARM::VLD3d32: + case ARM::VLD1d64T: + case ARM::VLD3d8_UPD: + case ARM::VLD3d16_UPD: + case ARM::VLD3d32_UPD: + case ARM::VLD1d64T_UPD: + case ARM::VLD3q8_UPD: + case ARM::VLD3q16_UPD: + case ARM::VLD3q32_UPD: + case ARM::VLD4d8: + case ARM::VLD4d16: + case ARM::VLD4d32: + case ARM::VLD1d64Q: + case ARM::VLD4d8_UPD: + case ARM::VLD4d16_UPD: + case ARM::VLD4d32_UPD: + case ARM::VLD1d64Q_UPD: + case ARM::VLD4q8_UPD: + case ARM::VLD4q16_UPD: + case ARM::VLD4q32_UPD: + case ARM::VLD1DUPq8: + case ARM::VLD1DUPq16: + case ARM::VLD1DUPq32: + case ARM::VLD1DUPq8_UPD: + case ARM::VLD1DUPq16_UPD: + case ARM::VLD1DUPq32_UPD: + case ARM::VLD2DUPd8: + case ARM::VLD2DUPd16: + case ARM::VLD2DUPd32: + case ARM::VLD2DUPd8_UPD: + case ARM::VLD2DUPd16_UPD: + case ARM::VLD2DUPd32_UPD: + case ARM::VLD4DUPd8: + case ARM::VLD4DUPd16: + case ARM::VLD4DUPd32: + case ARM::VLD4DUPd8_UPD: + case ARM::VLD4DUPd16_UPD: + case ARM::VLD4DUPd32_UPD: + case ARM::VLD1LNd8: + case ARM::VLD1LNd16: + case ARM::VLD1LNd32: + case ARM::VLD1LNd8_UPD: + case ARM::VLD1LNd16_UPD: + case ARM::VLD1LNd32_UPD: + case ARM::VLD2LNd8: + case ARM::VLD2LNd16: + case ARM::VLD2LNd32: + case ARM::VLD2LNq16: + case ARM::VLD2LNq32: + case ARM::VLD2LNd8_UPD: + case ARM::VLD2LNd16_UPD: + case 
ARM::VLD2LNd32_UPD: + case ARM::VLD2LNq16_UPD: + case ARM::VLD2LNq32_UPD: + case ARM::VLD4LNd8: + case ARM::VLD4LNd16: + case ARM::VLD4LNd32: + case ARM::VLD4LNq16: + case ARM::VLD4LNq32: + case ARM::VLD4LNd8_UPD: + case ARM::VLD4LNd16_UPD: + case ARM::VLD4LNd32_UPD: + case ARM::VLD4LNq16_UPD: + case ARM::VLD4LNq32_UPD: + // If the address is not 64-bit aligned, the latencies of these + // instructions increases by one. + ++Latency; + break; + } + return Latency; } @@ -2226,6 +2383,113 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, } } + if (DefAlign < 8 && Subtarget.isCortexA9()) + switch (DefTID.getOpcode()) { + default: break; + case ARM::VLD1q8Pseudo: + case ARM::VLD1q16Pseudo: + case ARM::VLD1q32Pseudo: + case ARM::VLD1q64Pseudo: + case ARM::VLD1q8Pseudo_UPD: + case ARM::VLD1q16Pseudo_UPD: + case ARM::VLD1q32Pseudo_UPD: + case ARM::VLD1q64Pseudo_UPD: + case ARM::VLD2d8Pseudo: + case ARM::VLD2d16Pseudo: + case ARM::VLD2d32Pseudo: + case ARM::VLD2q8Pseudo: + case ARM::VLD2q16Pseudo: + case ARM::VLD2q32Pseudo: + case ARM::VLD2d8Pseudo_UPD: + case ARM::VLD2d16Pseudo_UPD: + case ARM::VLD2d32Pseudo_UPD: + case ARM::VLD2q8Pseudo_UPD: + case ARM::VLD2q16Pseudo_UPD: + case ARM::VLD2q32Pseudo_UPD: + case ARM::VLD3d8Pseudo: + case ARM::VLD3d16Pseudo: + case ARM::VLD3d32Pseudo: + case ARM::VLD1d64TPseudo: + case ARM::VLD3d8Pseudo_UPD: + case ARM::VLD3d16Pseudo_UPD: + case ARM::VLD3d32Pseudo_UPD: + case ARM::VLD1d64TPseudo_UPD: + case ARM::VLD3q8Pseudo_UPD: + case ARM::VLD3q16Pseudo_UPD: + case ARM::VLD3q32Pseudo_UPD: + case ARM::VLD3q8oddPseudo: + case ARM::VLD3q16oddPseudo: + case ARM::VLD3q32oddPseudo: + case ARM::VLD3q8oddPseudo_UPD: + case ARM::VLD3q16oddPseudo_UPD: + case ARM::VLD3q32oddPseudo_UPD: + case ARM::VLD4d8Pseudo: + case ARM::VLD4d16Pseudo: + case ARM::VLD4d32Pseudo: + case ARM::VLD1d64QPseudo: + case ARM::VLD4d8Pseudo_UPD: + case ARM::VLD4d16Pseudo_UPD: + case ARM::VLD4d32Pseudo_UPD: + case ARM::VLD1d64QPseudo_UPD: + case ARM::VLD4q8Pseudo_UPD: + case ARM::VLD4q16Pseudo_UPD: + case ARM::VLD4q32Pseudo_UPD: + case ARM::VLD4q8oddPseudo: + case ARM::VLD4q16oddPseudo: + case ARM::VLD4q32oddPseudo: + case ARM::VLD4q8oddPseudo_UPD: + case ARM::VLD4q16oddPseudo_UPD: + case ARM::VLD4q32oddPseudo_UPD: + case ARM::VLD1DUPq8Pseudo: + case ARM::VLD1DUPq16Pseudo: + case ARM::VLD1DUPq32Pseudo: + case ARM::VLD1DUPq8Pseudo_UPD: + case ARM::VLD1DUPq16Pseudo_UPD: + case ARM::VLD1DUPq32Pseudo_UPD: + case ARM::VLD2DUPd8Pseudo: + case ARM::VLD2DUPd16Pseudo: + case ARM::VLD2DUPd32Pseudo: + case ARM::VLD2DUPd8Pseudo_UPD: + case ARM::VLD2DUPd16Pseudo_UPD: + case ARM::VLD2DUPd32Pseudo_UPD: + case ARM::VLD4DUPd8Pseudo: + case ARM::VLD4DUPd16Pseudo: + case ARM::VLD4DUPd32Pseudo: + case ARM::VLD4DUPd8Pseudo_UPD: + case ARM::VLD4DUPd16Pseudo_UPD: + case ARM::VLD4DUPd32Pseudo_UPD: + case ARM::VLD1LNq8Pseudo: + case ARM::VLD1LNq16Pseudo: + case ARM::VLD1LNq32Pseudo: + case ARM::VLD1LNq8Pseudo_UPD: + case ARM::VLD1LNq16Pseudo_UPD: + case ARM::VLD1LNq32Pseudo_UPD: + case ARM::VLD2LNd8Pseudo: + case ARM::VLD2LNd16Pseudo: + case ARM::VLD2LNd32Pseudo: + case ARM::VLD2LNq16Pseudo: + case ARM::VLD2LNq32Pseudo: + case ARM::VLD2LNd8Pseudo_UPD: + case ARM::VLD2LNd16Pseudo_UPD: + case ARM::VLD2LNd32Pseudo_UPD: + case ARM::VLD2LNq16Pseudo_UPD: + case ARM::VLD2LNq32Pseudo_UPD: + case ARM::VLD4LNd8Pseudo: + case ARM::VLD4LNd16Pseudo: + case ARM::VLD4LNd32Pseudo: + case ARM::VLD4LNq16Pseudo: + case ARM::VLD4LNq32Pseudo: + case ARM::VLD4LNd8Pseudo_UPD: + case ARM::VLD4LNd16Pseudo_UPD: + case 
ARM::VLD4LNd32Pseudo_UPD: + case ARM::VLD4LNq16Pseudo_UPD: + case ARM::VLD4LNq32Pseudo_UPD: + // If the address is not 64-bit aligned, the latencies of these + // instructions increases by one. + ++Latency; + break; + } + return Latency; } @@ -2264,9 +2528,7 @@ int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, default: return ItinData->getStageLatency(get(Opcode).getSchedClass()); case ARM::VLDMQIA: - case ARM::VLDMQDB: case ARM::VSTMQIA: - case ARM::VSTMQDB: return 2; } } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 7e2183d..9a2faf8 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -34,25 +34,7 @@ namespace ARMII { //===------------------------------------------------------------------===// // This four-bit field describes the addressing mode used. - - AddrModeMask = 0x1f, - AddrModeNone = 0, - AddrMode1 = 1, - AddrMode2 = 2, - AddrMode3 = 3, - AddrMode4 = 4, - AddrMode5 = 5, - AddrMode6 = 6, - AddrModeT1_1 = 7, - AddrModeT1_2 = 8, - AddrModeT1_4 = 9, - AddrModeT1_s = 10, // i8 * 4 for pc and sp relative data - AddrModeT2_i12 = 11, - AddrModeT2_i8 = 12, - AddrModeT2_so = 13, - AddrModeT2_pc = 14, // +/- i12 for pc relative data - AddrModeT2_i8s4 = 15, // i8 * 4 - AddrMode_i12 = 16, + AddrModeMask = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h // Size* - Flags to keep track of the size of an instruction. SizeShift = 5, @@ -64,11 +46,9 @@ namespace ARMII { // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load // and store ops only. Generic "updating" flag is used for ld/st multiple. + // The index mode enums are declared in ARMBaseInfo.h IndexModeShift = 8, IndexModeMask = 3 << IndexModeShift, - IndexModePre = 1, - IndexModePost = 2, - IndexModeUpd = 3, //===------------------------------------------------------------------===// // Instruction encoding formats. @@ -311,7 +291,7 @@ public: int64_t &Offset1, int64_t &Offset2)const; /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to - /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled togther. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. 
This function takes two integers that represent the load offsets @@ -327,7 +307,7 @@ public: const MachineFunction &MF) const; virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, unsigned ExtraPredCycles, + unsigned NumCycles, unsigned ExtraPredCycles, float Prob, float Confidence) const; virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB, @@ -337,10 +317,10 @@ public: float Probability, float Confidence) const; virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, float Probability, float Confidence) const { - return NumCyles == 1; + return NumCycles == 1; } /// AnalyzeCompare - For a comparison instruction, return the source register @@ -496,19 +476,19 @@ void emitARMRegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, - const ARMBaseInstrInfo &TII); + const ARMBaseInstrInfo &TII, unsigned MIFlags = 0); void emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, - const ARMBaseInstrInfo &TII); + const ARMBaseInstrInfo &TII, unsigned MIFlags = 0); void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, const TargetInstrInfo &TII, const ARMBaseRegisterInfo& MRI, - DebugLoc dl); + unsigned MIFlags = 0); /// rewriteARMFrameIndex / rewriteT2FrameIndex - diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 67a4b7d..ea1f08a 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -88,7 +88,7 @@ BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - // FIXME: avoid re-calculating this everytime. + // FIXME: avoid re-calculating this every time. BitVector Reserved(getNumRegs()); Reserved.set(ARM::SP); Reserved.set(ARM::PC); @@ -342,12 +342,51 @@ ARMBaseRegisterInfo::canCombineSubRegIndices(const TargetRegisterClass *RC, return false; } +const TargetRegisterClass* +ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) + const { + const TargetRegisterClass *Super = RC; + TargetRegisterClass::sc_iterator I = RC->superclasses_begin(); + do { + switch (Super->getID()) { + case ARM::GPRRegClassID: + case ARM::SPRRegClassID: + case ARM::DPRRegClassID: + case ARM::QPRRegClassID: + case ARM::QQPRRegClassID: + case ARM::QQQQPRRegClassID: + return Super; + } + Super = *I++; + } while (Super); + return RC; +} const TargetRegisterClass * ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const { return ARM::GPRRegisterClass; } +unsigned +ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + switch (RC->getID()) { + default: + return 0; + case ARM::tGPRRegClassID: + return TFI->hasFP(MF) ? 4 : 5; + case ARM::GPRRegClassID: { + unsigned FP = TFI->hasFP(MF) ? 1 : 0; + return 10 - FP - (STI.isR9Reserved() ? 1 : 0); + } + case ARM::SPRRegClassID: // Currently not used as 'rep' register class. 
+ case ARM::DPRRegClassID: + return 32 - 10; + } +} + /// getAllocationOrder - Returns the register allocation order for a specified /// register class in the form of a pair of TargetRegisterClass iterators. std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> @@ -428,6 +467,10 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC, ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8 }; + // We only support even/odd hints for GPR and rGPR. + if (RC != ARM::GPRRegisterClass && RC != ARM::rGPRRegisterClass) + return std::make_pair(RC->allocation_order_begin(MF), + RC->allocation_order_end(MF)); if (HintType == ARMRI::RegPairEven) { if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0) @@ -530,6 +573,29 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg, } } +bool +ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const { + // CortexA9 has a Write-after-write hazard for NEON registers. + if (!STI.isCortexA9()) + return false; + + switch (RC->getID()) { + case ARM::DPRRegClassID: + case ARM::DPR_8RegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::QPRRegClassID: + case ARM::QPR_8RegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::SPRRegClassID: + case ARM::SPR_8RegClassID: + // Avoid reusing S, D, and Q registers. + // Don't increase register pressure for QQ and QQQQ. + return true; + default: + return false; + } +} + bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -806,7 +872,7 @@ emitLoadConstPool(MachineBasicBlock &MBB, DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred, - unsigned PredReg) const { + unsigned PredReg, unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = @@ -816,7 +882,8 @@ emitLoadConstPool(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp)) .addReg(DestReg, getDefRegState(true), SubIdx) .addConstantPoolIndex(Idx) - .addImm(0).addImm(Pred).addReg(PredReg); + .addImm(0).addImm(Pred).addReg(PredReg) + .setMIFlags(MIFlags); } bool ARMBaseRegisterInfo:: diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index ba6bd2b..9edf72d 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -128,6 +128,12 @@ public: const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator> getAllocationOrder(const TargetRegisterClass *RC, unsigned HintType, unsigned HintReg, @@ -139,6 +145,8 @@ public: void UpdateRegAllocHint(unsigned Reg, unsigned NewReg, MachineFunction &MF) const; + virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const; + bool hasBasePointer(const MachineFunction &MF) const; bool canRealignStack(const MachineFunction &MF) const; @@ -176,7 +184,8 @@ public: unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred = ARMCC::AL, - unsigned PredReg = 0) const; + unsigned PredReg = 0, + unsigned MIFlags = MachineInstr::NoFlags)const; /// Code Generation virtual methods... 
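The new getRegPressureLimit values above are small fixed budgets: for GPR the limit is 10 minus one for a dedicated frame pointer and one for a reserved R9, and for tGPR it is 4 or 5 depending on the frame pointer. A worked sketch under assumed subtarget settings; the booleans are hypothetical inputs rather than values queried from a real `MachineFunction`:

```cpp
#include <cstdio>

// Mirrors the GPR case of ARMBaseRegisterInfo::getRegPressureLimit:
// start from 10 allocatable GPRs, subtract one for a frame pointer
// and one for a reserved R9.
static unsigned gprPressureLimit(bool HasFP, bool R9Reserved) {
  return 10 - (HasFP ? 1 : 0) - (R9Reserved ? 1 : 0);
}

// Mirrors the tGPR case: Thumb1 low registers, minus one if a frame
// pointer is in use.
static unsigned tgprPressureLimit(bool HasFP) { return HasFP ? 4 : 5; }

int main() {
  // e.g. a Darwin-like target that reserves R9 and keeps a frame pointer.
  std::printf("GPR limit  = %u\n", gprPressureLimit(true, true));   // 8
  std::printf("tGPR limit = %u\n", tgprPressureLimit(true));        // 4
}
```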
virtual bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 426ba13..d2981c0 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -22,6 +22,9 @@ class CCIfAlign<string Align, CCAction A>: //===----------------------------------------------------------------------===// def CC_ARM_APCS : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + CCIfType<[i8, i16], CCPromoteToType<i32>>, // Handle all vector types as either f64 or v2f64. diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index 9bbf6a0..fa73716 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -312,6 +312,15 @@ namespace { unsigned getRegisterListOpValue(const MachineInstr &MI, unsigned Op) const { return 0; } + unsigned getShiftRight8Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getShiftRight16Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getShiftRight32Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + unsigned getShiftRight64Imm(const MachineInstr &MI, unsigned Op) + const { return 0; } + /// getMovi32Value - Return binary encoding of operand for movw/movt. If the /// machine operand requires relocation, record the relocation and return /// zero. @@ -969,7 +978,7 @@ unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) { unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI, const TargetInstrDesc &TID) const { - for (unsigned i = MI.getNumOperands(), e = TID.getNumOperands(); i != e; --i){ + for (unsigned i = MI.getNumOperands(), e = TID.getNumOperands(); i >= e; --i){ const MachineOperand &MO = MI.getOperand(i-1); if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) return 1 << ARMII::S_BitShift; diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 13d1b33..baf95a3 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1650,24 +1650,27 @@ bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) { unsigned BrOffset = GetOffsetOf(Br.MI) + 4 - 2; unsigned DestOffset = BBOffsets[DestBB->getNumber()]; if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) { - MachineBasicBlock::iterator CmpMI = Br.MI; --CmpMI; - if (CmpMI->getOpcode() == ARM::tCMPi8) { - unsigned Reg = CmpMI->getOperand(0).getReg(); - Pred = llvm::getInstrPredicate(CmpMI, PredReg); - if (Pred == ARMCC::AL && - CmpMI->getOperand(1).getImm() == 0 && - isARMLowRegister(Reg)) { - MachineBasicBlock *MBB = Br.MI->getParent(); - MachineInstr *NewBR = - BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc)) - .addReg(Reg).addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags()); - CmpMI->eraseFromParent(); - Br.MI->eraseFromParent(); - Br.MI = NewBR; - BBSizes[MBB->getNumber()] -= 2; - AdjustBBOffsetsAfter(MBB, -2); - ++NumCBZ; - MadeChange = true; + MachineBasicBlock::iterator CmpMI = Br.MI; + if (CmpMI != Br.MI->getParent()->begin()) { + --CmpMI; + if (CmpMI->getOpcode() == ARM::tCMPi8) { + unsigned Reg = CmpMI->getOperand(0).getReg(); + Pred = llvm::getInstrPredicate(CmpMI, PredReg); + if (Pred == ARMCC::AL && + CmpMI->getOperand(1).getImm() == 0 && + isARMLowRegister(Reg)) { + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineInstr *NewBR = + BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc)) + 
.addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags()); + CmpMI->eraseFromParent(); + Br.MI->eraseFromParent(); + Br.MI = NewBR; + BBSizes[MBB->getNumber()] -= 2; + AdjustBBOffsetsAfter(MBB, -2); + ++NumCBZ; + MadeChange = true; + } } } } diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index bd753d2..b6b3c75 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -455,6 +455,10 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { // Add an implicit def for the super-register. MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MI.eraseFromParent(); } @@ -496,10 +500,13 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); - if (SrcIsKill) - // Add an implicit kill for the super-reg. - (*MIB).addRegisterKilled(SrcReg, TRI, true); + if (SrcIsKill) // Add an implicit kill for the super-reg. + MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); + + // Transfer memoperands. + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MI.eraseFromParent(); } @@ -622,9 +629,8 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); - if (SrcIsKill) - // Add an implicit kill for the super-reg. - (*MIB).addRegisterKilled(SrcReg, TRI, true); + if (SrcIsKill) // Add an implicit kill for the super-reg. + MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); } @@ -655,8 +661,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); LO16 = LO16.addImm(SOImmValV1); HI16 = HI16.addImm(SOImmValV2); - (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); LO16.addImm(Pred).addReg(PredReg).addReg(0); HI16.addImm(Pred).addReg(PredReg).addReg(0); TransferImpOps(MI, LO16, HI16); @@ -692,8 +698,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); } - (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); LO16.addImm(Pred).addReg(PredReg); HI16.addImm(Pred).addReg(PredReg); @@ -708,6 +714,78 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, switch (Opcode) { default: return false; + case ARM::VMOVScc: + case ARM::VMOVDcc: { + unsigned newOpc = Opcode == ARM::VMOVScc ? 
ARM::VMOVS : ARM::VMOVD; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc), + MI.getOperand(1).getReg()) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()); + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCr: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVr), + MI.getOperand(1).getReg()) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCs: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs), + (MI.getOperand(1).getReg())) + .addReg(MI.getOperand(2).getReg(), + getKillRegState(MI.getOperand(2).isKill())) + .addReg(MI.getOperand(3).getReg(), + getKillRegState(MI.getOperand(3).isKill())) + .addImm(MI.getOperand(4).getImm()) + .addImm(MI.getOperand(5).getImm()) // 'pred' + .addReg(MI.getOperand(6).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCi16: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi16), + MI.getOperand(1).getReg()) + .addImm(MI.getOperand(2).getImm()) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()); + + MI.eraseFromParent(); + return true; + } + case ARM::MOVCCi: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), + MI.getOperand(1).getReg()) + .addImm(MI.getOperand(2).getImm()) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } + case ARM::MVNCCi: { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi), + MI.getOperand(1).getReg()) + .addImm(MI.getOperand(2).getImm()) + .addImm(MI.getOperand(3).getImm()) // 'pred' + .addReg(MI.getOperand(4).getReg()) + .addReg(0); // 's' bit + + MI.eraseFromParent(); + return true; + } case ARM::Int_eh_sjlj_dispatchsetup: { MachineFunction &MF = *MI.getParent()->getParent(); const ARMBaseInstrInfo *AII = @@ -726,9 +804,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, llvm::emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, FramePtr, -NumBytes, ARMCC::AL, 0, *TII); } else if (AFI->isThumbFunction()) { - llvm::emitThumbRegPlusImmediate(MBB, MBBI, ARM::R6, - FramePtr, -NumBytes, - *TII, RI, MI.getDebugLoc()); + llvm::emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, *TII, RI); } else { llvm::emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, FramePtr, -NumBytes, ARMCC::AL, 0, @@ -785,7 +862,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, TII->get(ARM::BL)) .addExternalSymbol("__aeabi_read_tp", 0); - (*MIB).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); return true; @@ -800,7 +877,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg) .addOperand(MI.getOperand(1))); - (*MIB1).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) @@ -823,7 +900,7 @@ bool 
ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, const MachineOperand &MO1 = MI.getOperand(1); const GlobalValue *GV = MO1.getGlobal(); unsigned TF = MO1.getTargetFlags(); - bool isARM = Opcode != ARM::t2MOV_ga_pcrel; + bool isARM = (Opcode != ARM::t2MOV_ga_pcrel && Opcode != ARM::t2MOV_ga_dyn); bool isPIC = (Opcode != ARM::MOV_ga_dyn && Opcode != ARM::t2MOV_ga_dyn); unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel; unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel : ARM::t2MOVTi16_ga_pcrel; @@ -856,7 +933,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, if (isARM) { AddDefaultPred(MIB3); if (Opcode == ARM::MOV_ga_pcrel_ldr) - (*MIB2).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB2->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } TransferImpOps(MI, MIB1, MIB3); MI.eraseFromParent(); @@ -896,9 +973,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, return true; } - case ARM::VLDMQIA: - case ARM::VLDMQDB: { - unsigned NewOpc = (Opcode == ARM::VLDMQIA) ? ARM::VLDMDIA : ARM::VLDMDDB; + case ARM::VLDMQIA: { + unsigned NewOpc = ARM::VLDMDIA; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); unsigned OpIdx = 0; @@ -927,9 +1003,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, return true; } - case ARM::VSTMQIA: - case ARM::VSTMQDB: { - unsigned NewOpc = (Opcode == ARM::VSTMQIA) ? ARM::VSTMDIA : ARM::VSTMDDB; + case ARM::VSTMQIA: { + unsigned NewOpc = ARM::VSTMDIA; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); unsigned OpIdx = 0; @@ -950,9 +1025,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); MIB.addReg(D0).addReg(D1); - if (SrcIsKill) - // Add an implicit kill for the Q register. - (*MIB).addRegisterKilled(SrcReg, TRI, true); + if (SrcIsKill) // Add an implicit kill for the Q register. + MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); @@ -960,14 +1034,16 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } case ARM::VDUPfqf: case ARM::VDUPfdf:{ - unsigned NewOpc = Opcode == ARM::VDUPfqf ? ARM::VDUPLNfq : ARM::VDUPLNfd; + unsigned NewOpc = Opcode == ARM::VDUPfqf ? ARM::VDUPLN32q : + ARM::VDUPLN32d; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); unsigned OpIdx = 0; unsigned SrcReg = MI.getOperand(1).getReg(); unsigned Lane = getARMRegisterNumbering(SrcReg) & 1; unsigned DReg = TRI->getMatchingSuperReg(SrcReg, - Lane & 1 ? ARM::ssub_1 : ARM::ssub_0, &ARM::DPR_VFP2RegClass); + Lane & 1 ? ARM::ssub_1 : ARM::ssub_0, + &ARM::DPR_VFP2RegClass); // The lane is [0,1] for the containing DReg superregister. // Copy the dst/src register operands. 
MIB.addOperand(MI.getOperand(OpIdx++)); diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 26f48b3..3baf274 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "ARM.h" +#include "ARMAddressingModes.h" #include "ARMBaseInstrInfo.h" #include "ARMCallingConv.h" #include "ARMRegisterInfo.h" @@ -26,6 +27,7 @@ #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/Operator.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -115,6 +117,11 @@ class ARMFastISel : public FastISel { const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill); + virtual unsigned FastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill); virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, @@ -123,14 +130,18 @@ class ARMFastISel : public FastISel { const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, const ConstantFP *FPImm); - virtual unsigned FastEmitInst_i(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - uint64_t Imm); virtual unsigned FastEmitInst_rri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill, uint64_t Imm); + virtual unsigned FastEmitInst_i(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm); + virtual unsigned FastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm1, uint64_t Imm2); + virtual unsigned FastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx); @@ -193,6 +204,7 @@ class ARMFastISel : public FastISel { // OptionalDef handling routines. private: + bool isARMNEONPred(const MachineInstr *MI); bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR); const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB); void AddLoadStoreOperands(EVT VT, Address &Addr, @@ -221,6 +233,21 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { return true; } +bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { + const TargetInstrDesc &TID = MI->getDesc(); + + // If we're a thumb2 or not NEON function we were handled via isPredicable. + if ((TID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON || + AFI->isThumb2Function()) + return false; + + for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) + if (TID.OpInfo[i].isPredicate()) + return true; + + return false; +} + // If the machine is predicable go ahead and add the predicate operands, if // it needs default CC operands add those. // TODO: If we want to support thumb1 then we'll need to deal with optional @@ -230,8 +257,10 @@ const MachineInstrBuilder & ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { MachineInstr *MI = &*MIB; - // Do we use a predicate? - if (TII.isPredicable(MI)) + // Do we use a predicate? or... + // Are we NEON in ARM mode and have a predicate operand? If so, I know + // we're not predicable but add it anyways. + if (TII.isPredicable(MI) || isARMNEONPred(MI)) AddDefaultPred(MIB); // Do we optionally set a predicate? 
Preds is size > 0 iff the predicate @@ -296,6 +325,31 @@ unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode, return ResultReg; } +unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addReg(Op2, Op2IsKill * RegState::Kill)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill) + .addReg(Op1, Op1IsKill * RegState::Kill) + .addReg(Op2, Op2IsKill * RegState::Kill)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, @@ -384,6 +438,26 @@ unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode, return ResultReg; } +unsigned ARMFastISel::FastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + uint64_t Imm1, uint64_t Imm2) { + unsigned ResultReg = createResultReg(RC); + const TargetInstrDesc &II = TII.get(MachineInstOpcode); + + if (II.getNumDefs() >= 1) + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addImm(Imm1).addImm(Imm2)); + else { + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addImm(Imm1).addImm(Imm2)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::COPY), + ResultReg) + .addReg(II.ImplicitDefs[0])); + } + return ResultReg; +} + unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx) { @@ -667,24 +741,29 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { TmpOffset += SL->getElementOffset(Idx); } else { uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); - SmallVector<const Value *, 4> Worklist; - Worklist.push_back(Op); - do { - Op = Worklist.pop_back_val(); + for (;;) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { // Constant-offset addressing. TmpOffset += CI->getSExtValue() * S; - } else if (isa<AddOperator>(Op) && - isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { - // An add with a constant operand. Fold the constant. + break; + } + if (isa<AddOperator>(Op) && + (!isa<Instruction>(Op) || + FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] + == FuncInfo.MBB) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add (in the same block) with a constant operand. Fold the + // constant. ConstantInt *CI = - cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); TmpOffset += CI->getSExtValue() * S; - // Add the other operand back to the work list. - Worklist.push_back(cast<AddOperator>(Op)->getOperand(0)); - } else - goto unsupported_gep; - } while (!Worklist.empty()); + // Iterate on the other operand. 
+ Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + // Unsupported + goto unsupported_gep; + } } } @@ -767,26 +846,9 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT) { // Since the offset is too large for the load/store instruction // get the reg+offset into a register. if (needsLowering) { - ARMCC::CondCodes Pred = ARMCC::AL; - unsigned PredReg = 0; - - TargetRegisterClass *RC = isThumb ? ARM::tGPRRegisterClass : - ARM::GPRRegisterClass; - unsigned BaseReg = createResultReg(RC); - - if (!isThumb) - emitARMRegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - BaseReg, Addr.Base.Reg, Addr.Offset, - Pred, PredReg, - static_cast<const ARMBaseInstrInfo&>(TII)); - else { - assert(AFI->isThumb2Function()); - emitT2RegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - BaseReg, Addr.Base.Reg, Addr.Offset, Pred, PredReg, - static_cast<const ARMBaseInstrInfo&>(TII)); - } + Addr.Base.Reg = FastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg, + /*Op0IsKill*/false, Addr.Offset, MVT::i32); Addr.Offset = 0; - Addr.Base.Reg = BaseReg; } } @@ -797,7 +859,7 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, if (VT.getSimpleVT().SimpleTy == MVT::f32 || VT.getSimpleVT().SimpleTy == MVT::f64) Addr.Offset /= 4; - + // Frame base works a bit differently. Handle it separately. if (Addr.BaseType == Address::FrameIndexBase) { int FI = Addr.Base.FI; @@ -819,7 +881,7 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, } else { // Now add the rest of the operands. MIB.addReg(Addr.Base.Reg); - + // ARM halfword load/stores need an additional operand. if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0); @@ -1007,18 +1069,16 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { // behavior. // TODO: Factor this out. if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { - if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { - MVT VT; - const Type *Ty = CI->getOperand(0)->getType(); - if (!isTypeLegal(Ty, VT)) - return false; - + MVT SourceVT; + const Type *Ty = CI->getOperand(0)->getType(); + if (CI->hasOneUse() && (CI->getParent() == I->getParent()) + && isTypeLegal(Ty, SourceVT)) { bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy()); if (isFloat && !Subtarget->hasVFP2()) return false; unsigned CmpOpc; - switch (VT.SimpleTy) { + switch (SourceVT.SimpleTy) { default: return false; // TODO: Verify compares. case MVT::f32: @@ -1033,7 +1093,14 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { } // Get the compare predicate. - ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate()); + // Try to take advantage of fallthrough opportunities. + CmpInst::Predicate Predicate = CI->getPredicate(); + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + ARMCC::CondCodes ARMPred = getComparePred(Predicate); // We may not handle every CC for now. if (ARMPred == ARMCC::AL) return false; @@ -1061,19 +1128,55 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { FuncInfo.MBB->addSuccessor(TBB); return true; } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + (isTypeLegal(TI->getOperand(0)->getType(), SourceVT))) { + unsigned TstOpc = isThumb ? 
ARM::t2TSTri : ARM::TSTri; + unsigned OpReg = getRegForValue(TI->getOperand(0)); + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TstOpc)) + .addReg(OpReg).addImm(1)); + + unsigned CCMode = ARMCC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CCMode = ARMCC::EQ; + } + + unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) + .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); + + FastEmitBranch(FBB, DL); + FuncInfo.MBB->addSuccessor(TBB); + return true; + } } unsigned CmpReg = getRegForValue(BI->getCondition()); if (CmpReg == 0) return false; - // Re-set the flags just in case. - unsigned CmpOpc = isThumb ? ARM::t2CMPri : ARM::CMPri; - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc)) - .addReg(CmpReg).addImm(0)); + // We've been divorced from our compare! Our block was split, and + // now our compare lives in a predecessor block. We mustn't + // re-compare here, as the children of the compare aren't guaranteed + // live across the block boundary (we *could* check for this). + // Regardless, the compare has been done in the predecessor block, + // and it left a value for us in a virtual register. Ergo, we test + // the one-bit value left in the virtual register. + unsigned TstOpc = isThumb ? ARM::t2TSTri : ARM::TSTri; + AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc)) + .addReg(CmpReg).addImm(1)); + + unsigned CCMode = ARMCC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CCMode = ARMCC::EQ; + } unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc)) - .addMBB(TBB).addImm(ARMCC::NE).addReg(ARM::CPSR); + .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); FastEmitBranch(FBB, DL); FuncInfo.MBB->addSuccessor(TBB); return true; @@ -1636,17 +1739,9 @@ bool ARMFastISel::SelectRet(const Instruction *I) { unsigned ARMFastISel::ARMSelectCallOp(const GlobalValue *GV) { - // Depend our opcode for thumb on whether or not we're targeting an - // externally callable function. For libcalls we'll just pass a NULL GV - // in here. - bool isExternal = false; - if (!GV || GV->hasExternalLinkage()) isExternal = true; - // Darwin needs the r9 versions of the opcodes. bool isDarwin = Subtarget->isTargetDarwin(); - if (isThumb && isExternal) { - return isDarwin ? ARM::tBLXi_r9 : ARM::tBLXi; - } else if (isThumb) { + if (isThumb) { return isDarwin ? ARM::tBLr9 : ARM::tBL; } else { return isDarwin ? ARM::BLr9 : ARM::BL; } @@ -1671,9 +1766,6 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { else if (!isTypeLegal(RetTy, RetVT)) return false; - // For now we're using BLX etc on the assumption that we have v5t ops. - if (!Subtarget->hasV5TOps()) return false; - // TODO: For now if we have long calls specified we don't handle the call. if (EnableARMLongCalls) return false; @@ -1711,7 +1803,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) return false; - // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops. + // Issue the call, BLr9 for darwin, BL otherwise. // TODO: Turn this into the table of arm call ops.
MachineInstrBuilder MIB; unsigned CallOpc = ARMSelectCallOp(NULL); @@ -1772,13 +1864,9 @@ bool ARMFastISel::SelectCall(const Instruction *I) { else if (!isTypeLegal(RetTy, RetVT)) return false; - // For now we're using BLX etc on the assumption that we have v5t ops. - // TODO: Maybe? - if (!Subtarget->hasV5TOps()) return false; - // TODO: For now if we have long calls specified we don't handle the call. if (EnableARMLongCalls) return false; - + // Set up the argument vectors. SmallVector<Value*, 8> Args; SmallVector<unsigned, 8> ArgRegs; @@ -1827,7 +1915,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) { if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) return false; - // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops. + // Issue the call, BLr9 for darwin, BL otherwise. // TODO: Turn this into the table of arm call ops. MachineInstrBuilder MIB; unsigned CallOpc = ARMSelectCallOp(GV); @@ -1842,7 +1930,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) { MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) .addGlobalAddress(GV, 0, 0)); - + // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 68c33f0..e2e95d4 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -106,14 +106,13 @@ static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, const ARMBaseInstrInfo &TII, - int NumBytes, - ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { + int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { if (isARM) emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - Pred, PredReg, TII); + ARMCC::AL, 0, TII, MIFlags); else emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - Pred, PredReg, TII); + ARMCC::AL, 0, TII, MIFlags); } void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { @@ -141,11 +140,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { // Allocate the vararg register save area. This is not counted in NumBytes. if (VARegSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize, + MachineInstr::FrameSetup); if (!AFI->hasStackFrame()) { if (NumBytes != 0) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); return; } @@ -196,7 +197,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0); + .addFrameIndex(FramePtrSpillFI).addImm(0) + .setMIFlag(MachineInstr::FrameSetup); AddDefaultCC(AddDefaultPred(MIB)); } @@ -226,7 +228,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { NumBytes = DPRCSOffset; if (NumBytes) { // Adjust SP after all the callee-save spills. - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); if (HasFP && isARM) // Restore from fp only in ARM mode: e.g. 
sub sp, r7, #24 // Note it's not safe to do this in Thumb2 mode because it would have @@ -282,6 +285,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. + // FIXME: Clarify FrameSetup flags here. if (RegInfo->hasBasePointer(MF)) { if (isARM) BuildMI(MBB, MBBI, dl, @@ -396,8 +400,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, // Jump to label or value in register. if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND) { unsigned TCOpcode = (RetOpcode == ARM::TCRETURNdi) - ? (STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd) - : (STI.isThumb() ? ARM::TAILJMPdNDt : ARM::TAILJMPdND); + ? (STI.isThumb() ? ARM::tTAILJMPd : ARM::TAILJMPd) + : (STI.isThumb() ? ARM::tTAILJMPdND : ARM::TAILJMPdND); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); if (JumpTarget.isGlobal()) MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), @@ -408,10 +412,12 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, JumpTarget.getTargetFlags()); } } else if (RetOpcode == ARM::TCRETURNri) { - BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPr)). + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)). addReg(JumpTarget.getReg(), RegState::Kill); } else if (RetOpcode == ARM::TCRETURNriND) { - BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)). + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::tTAILJMPrND : ARM::TAILJMPrND)). addReg(JumpTarget.getReg(), RegState::Kill); } @@ -439,8 +445,7 @@ ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, int ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg, + int FI, unsigned &FrameReg, int SPAdj) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMBaseRegisterInfo *RegInfo = @@ -484,19 +489,23 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, return FPOffset; } else if (MFI->hasVarSizedObjects()) { assert(RegInfo->hasBasePointer(MF) && "missing base pointer!"); - // Try to use the frame pointer if we can, else use the base pointer - // since it's available. This is handy for the emergency spill slot, in - // particular. if (AFI->isThumb2Function()) { + // Try to use the frame pointer if we can, else use the base pointer + // since it's available. This is handy for the emergency spill slot, in + // particular. if (FPOffset >= -255 && FPOffset < 0) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; } - } else - FrameReg = RegInfo->getBaseRegister(); + } } else if (AFI->isThumb2Function()) { + // Use add <rd>, sp, #<imm8> + // ldr <rd>, [sp, #<imm8>] + // if at all possible to save space. + if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020) + return Offset; // In Thumb2 mode, the negative offset is very limited. Try to avoid - // out of range references. + // out of range references. 
ldr <rt>,[<rn>, #-<imm8>] if (FPOffset >= -255 && FPOffset < 0) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; @@ -524,7 +533,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, unsigned StrOpc, bool NoGap, - bool(*Func)(unsigned, bool)) const { + bool(*Func)(unsigned, bool), + unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); @@ -567,14 +577,14 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, if (Regs.size() > 1 || StrOpc== 0) { MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP) - .addReg(ARM::SP)); + .addReg(ARM::SP).setMIFlags(MIFlags)); for (unsigned i = 0, e = Regs.size(); i < e; ++i) MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second)); } else if (Regs.size() == 1) { MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc), ARM::SP) .addReg(Regs[0].first, getKillRegState(Regs[0].second)) - .addReg(ARM::SP); + .addReg(ARM::SP).setMIFlags(MIFlags); // ARM mode needs an extra reg0 here due to addrmode2. Will go away once // that refactoring is complete (eventually). if (StrOpc == ARM::STR_PRE) { @@ -676,9 +686,12 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD; unsigned PushOneOpc = AFI->isThumbFunction() ? ARM::t2STR_PRE : ARM::STR_PRE; unsigned FltOpc = ARM::VSTMDDB_UPD; - emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register); - emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register); - emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register); + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, + MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, + MachineInstr::FrameSetup); + emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register, + MachineInstr::FrameSetup); return true; } diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 1288b70..61bb8af 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -51,7 +51,8 @@ public: bool canSimplifyCallFramePseudos(const MachineFunction &MF) const; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const; - int ResolveFrameIndexReference(const MachineFunction &MF, int FI, + int ResolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, int SPAdj) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const; @@ -62,7 +63,8 @@ public: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, unsigned StrOpc, bool NoGap, - bool(*Func)(unsigned, bool)) const; + bool(*Func)(unsigned, bool), + unsigned MIFlags = 0) const; void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc, unsigned LdrOpc, bool isVarArg, bool NoGap, diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp index e97ce50..517bba8 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -49,6 +49,8 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { const TargetInstrDesc &LastTID = LastMI->getDesc(); // Skip over one non-VFP / NEON instruction. 
if (!LastTID.isBarrier() && + // On A9, AGU and NEON/FPU are muxed. + !(STI.isCortexA9() && (LastTID.mayLoad() || LastTID.mayStore())) && (LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) { MachineBasicBlock::iterator I = LastMI; if (I != LastMI->getParent()->begin()) { diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index f0d5a7d..abe5a31 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -45,7 +45,7 @@ DisableShifterOp("disable-shifter-op", cl::Hidden, static cl::opt<bool> CheckVMLxHazard("check-vmlx-hazard", cl::Hidden, cl::desc("Check fp vmla / vmls hazard at isel time"), - cl::init(false)); + cl::init(true)); //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine @@ -91,9 +91,14 @@ public: bool isShifterOpProfitable(const SDValue &Shift, ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt); bool SelectShifterOperandReg(SDValue N, SDValue &A, - SDValue &B, SDValue &C); + SDValue &B, SDValue &C, + bool CheckProfitability = true); bool SelectShiftShifterOperandReg(SDValue N, SDValue &A, - SDValue &B, SDValue &C); + SDValue &B, SDValue &C) { + // Don't apply the profitability check + return SelectShifterOperandReg(N, A, B, C, false); + } + bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); @@ -174,16 +179,6 @@ public: return ARM_AM::getT2SOImmVal(~Imm) != -1; } - inline bool Pred_so_imm(SDNode *inN) const { - ConstantSDNode *N = cast<ConstantSDNode>(inN); - return is_so_imm(N->getZExtValue()); - } - - inline bool Pred_t2_so_imm(SDNode *inN) const { - ConstantSDNode *N = cast<ConstantSDNode>(inN); - return is_t2_so_imm(N->getZExtValue()); - } - // Include the pieces autogenerated from the target description. #include "ARMGenDAGISel.inc" @@ -373,7 +368,8 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N, SDValue &BaseReg, SDValue &ShReg, - SDValue &Opc) { + SDValue &Opc, + bool CheckProfitability) { if (DisableShifterOp) return false; @@ -390,7 +386,7 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N, ShImmVal = RHS->getZExtValue() & 31; } else { ShReg = N.getOperand(1); - if (!isShifterOpProfitable(N, ShOpcVal, ShImmVal)) + if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal)) return false; } Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), @@ -398,30 +394,6 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N, return true; } -bool ARMDAGToDAGISel::SelectShiftShifterOperandReg(SDValue N, - SDValue &BaseReg, - SDValue &ShReg, - SDValue &Opc) { - ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); - - // Don't match base register only case. That is matched to a separate - // lower complexity pattern with explicit register operand. - if (ShOpcVal == ARM_AM::no_shift) return false; - - BaseReg = N.getOperand(0); - unsigned ShImmVal = 0; - // Do not check isShifterOpProfitable. This must return true. 
- if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - ShReg = CurDAG->getRegister(0, MVT::i32); - ShImmVal = RHS->getZExtValue() & 31; - } else { - ShReg = N.getOperand(1); - } - Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), - MVT::i32); - return true; -} - bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm) { @@ -437,7 +409,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } - + if (N.getOpcode() == ARMISD::Wrapper && !(Subtarget->useMovt() && N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { @@ -1138,7 +1110,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; } - + if (N.getOpcode() == ARMISD::Wrapper && !(Subtarget->useMovt() && N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { @@ -1183,7 +1155,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && !CurDAG->isBaseWithConstantOffset(N)) return false; - + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { int RHSC = (int)RHS->getSExtValue(); if (N.getOpcode() == ISD::SUB) @@ -1571,6 +1543,11 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.data(), Ops.size()); } + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1); + if (NumVecs == 1) return VLd; @@ -1600,6 +1577,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + SDValue Chain = N->getOperand(0); EVT VT = N->getOperand(Vec0Idx).getValueType(); bool is64BitVector = VT.is64BitVector(); @@ -1672,7 +1652,13 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + SDNode *VSt = + CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + + // Transfer memoperands. + cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1); + + return VSt; } // Otherwise, quad registers are stored with two separate instructions, @@ -1693,6 +1679,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, MemAddr.getValueType(), MVT::Other, OpsA, 7); + cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1); Chain = SDValue(VStA, 1); // Store the odd D registers. 
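The setMemRefs additions scattered through SelectVLD and SelectVST above all repeat one small idiom. A minimal sketch of it, assuming the SelectionDAG API of this LLVM vintage, with MF, N, and VSt standing for the current MachineFunction, the memory-intrinsic node being selected, and the freshly built machine node in the surrounding code:

    // Allocate a one-element memoperand array from the MachineFunction,
    // seed it with the intrinsic's MachineMemOperand, and attach it to the
    // new machine node so alias analysis and scheduling still see the
    // underlying memory access after instruction selection.
    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
    MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
    cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);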
@@ -1709,8 +1696,10 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); - return CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, - Ops.data(), Ops.size()); + SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, + Ops.data(), Ops.size()); + cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1); + return VStB; } SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, @@ -1726,6 +1715,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + SDValue Chain = N->getOperand(0); unsigned Lane = cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); @@ -1812,6 +1804,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, QOpcodes[OpcodeIndex]); SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1); if (!IsLoad) return VLdLn; @@ -1838,6 +1831,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + SDValue Chain = N->getOperand(0); EVT VT = N->getValueType(0); @@ -1882,12 +1878,13 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs; std::vector<EVT> ResTys; - ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts)); + ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts)); if (isUpdating) ResTys.push_back(MVT::i32); ResTys.push_back(MVT::Other); SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size()); + cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1); SuperReg = SDValue(VLdDup, 0); // Extract the subregisters. @@ -2168,7 +2165,7 @@ SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) { // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) // Pattern complexity = 6 cost = 11 size = 0 // - // Also FCPYScc and FCPYDcc. + // Also VMOVScc and VMOVDcc. 
SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32); SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag }; unsigned Opc = 0; @@ -2450,34 +2447,6 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::CMOV: return SelectCMOVOp(N); - case ARMISD::CNEG: { - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDValue N2 = N->getOperand(2); - SDValue N3 = N->getOperand(3); - SDValue InFlag = N->getOperand(4); - assert(N2.getOpcode() == ISD::Constant); - assert(N3.getOpcode() == ISD::Register); - - SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned) - cast<ConstantSDNode>(N2)->getZExtValue()), - MVT::i32); - SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag }; - unsigned Opc = 0; - switch (VT.getSimpleVT().SimpleTy) { - default: assert(false && "Illegal conditional move type!"); - break; - case MVT::f32: - Opc = ARM::VNEGScc; - break; - case MVT::f64: - Opc = ARM::VNEGDcc; - break; - } - return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5); - } - case ARMISD::VZIP: { unsigned Opc = 0; EVT VT = N->getValueType(0); @@ -2870,6 +2839,35 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { break; } + case ARMISD::VTBL1: { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SmallVector<SDValue, 6> Ops; + + Ops.push_back(N->getOperand(0)); + Ops.push_back(N->getOperand(1)); + Ops.push_back(getAL(CurDAG)); // Predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register + return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops.data(), Ops.size()); + } + case ARMISD::VTBL2: { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + // Form a REG_SEQUENCE to force register allocation. + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + SDValue RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0); + + SmallVector<SDValue, 6> Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(2)); + Ops.push_back(getAL(CurDAG)); // Predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register + return CurDAG->getMachineNode(ARM::VTBL2Pseudo, dl, VT, + Ops.data(), Ops.size()); + } + case ISD::CONCAT_VECTORS: return SelectConcatVector(N); } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index ab9f9e1..0a31b87 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -72,6 +72,11 @@ ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true)); +// The APCS parameter registers. +static const unsigned GPRArgRegs[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 +}; + void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT) { if (VT != PromotedLdStVT) { @@ -393,6 +398,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS); } + // Use divmod iOS compiler-rt calls. 
+ if (Subtarget->getTargetTriple().getOS() == Triple::IOS) { + setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); + setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); + } + if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, ARM::tGPRRegisterClass); else @@ -461,6 +472,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::UDIV, MVT::v8i8, Custom); setOperationAction(ISD::VSETCC, MVT::v1i64, Expand); setOperationAction(ISD::VSETCC, MVT::v2i64, Expand); + // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with + // a destination type that is wider than the source. + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); @@ -502,18 +517,15 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } // i64 operation support. + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); if (Subtarget->isThumb1Only()) { - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i32, Expand); - setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); - } else { - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i32, Expand); - if (!Subtarget->hasV6Ops()) - setOperationAction(ISD::MULHS, MVT::i32, Expand); } + if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()) + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); @@ -597,6 +609,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); // Since the libcalls include locking, fold in the fences setShouldFoldAtomicFences(true); } @@ -716,7 +740,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // pressure of the register class's representative and all of it's super // classes' representatives transitively. We have not implemented this because // of the difficulty prior to coalescing of modeling operand register classes -// due to the common occurence of cross class copies and subregister insertions +// due to the common occurrence of cross class copies and subregister insertions // and extractions. 
std::pair<const TargetRegisterClass*, uint8_t> ARMTargetLowering::findRepresentativeClass(EVT VT) const{ @@ -778,7 +802,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; case ARMISD::CMOV: return "ARMISD::CMOV"; - case ARMISD::CNEG: return "ARMISD::CNEG"; case ARMISD::RBIT: return "ARMISD::RBIT"; @@ -853,6 +876,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::VTBL1: return "ARMISD::VTBL1"; + case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; @@ -861,6 +886,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; + case ARMISD::VBSL: return "ARMISD::VBSL"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; @@ -946,27 +972,6 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { return Sched::RegPressure; } -// FIXME: Move to RegInfo -unsigned -ARMTargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (RC->getID()) { - default: - return 0; - case ARM::tGPRRegClassID: - return TFI->hasFP(MF) ? 4 : 5; - case ARM::GPRRegClassID: { - unsigned FP = TFI->hasFP(MF) ? 1 : 0; - return 10 - FP - (Subtarget->isR9Reserved() ? 1 : 0); - } - case ARM::SPRRegClassID: // Currently not used as 'rep' register class. - case ARM::DPRRegClassID: - return 32 - 10; - } -} - //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// @@ -1130,22 +1135,6 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, return Chain; } -/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified -/// by "Src" to address "Dst" of size "Size". Alignment information is -/// specified by the specific parameter attribute. The copy will be passed as -/// a byval function parameter. -/// Sometimes what we are copying is the end of a larger object, the part that -/// does not fit in registers. -static SDValue -CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, - ISD::ArgFlagsTy Flags, SelectionDAG &DAG, - DebugLoc dl) { - SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); - return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), - /*isVolatile=*/false, /*AlwaysInline=*/false, - MachinePointerInfo(0), MachinePointerInfo(0)); -} - /// LowerMemOpCallTo - Store the argument to the stack. 
SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, @@ -1156,9 +1145,6 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); - if (Flags.isByVal()) - return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - return DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(LocMemOffset), false, false, 0); @@ -1224,6 +1210,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); + CCInfo.setCallOrPrologue(Call); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); @@ -1253,6 +1240,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { @@ -1299,6 +1287,43 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else if (isByVal) { + assert(VA.isMemLoc()); + unsigned offset = 0; + + // True if this byval aggregate will be split between registers + // and memory. + if (CCInfo.isFirstByValRegValid()) { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + unsigned int i, j; + for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) { + SDValue Const = DAG.getConstant(4*i, MVT::i32); + SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(j, Load)); + } + offset = ARM::R4 - CCInfo.getFirstByValReg(); + CCInfo.clearFirstByValReg(); + } + + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); + SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, + StkPtrOff); + SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); + SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, + MVT::i32); + MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, + Flags.getByValAlign(), + /*isVolatile=*/false, + /*AlwaysInline=*/false, + MachinePointerInfo(0), + MachinePointerInfo(0))); + } else if (!IsSibCall) { assert(VA.isMemLoc()); @@ -1332,7 +1357,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, @@ -1492,6 +1517,35 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, dl, DAG, InVals); } +/// HandleByVal - Every parameter *after* a byval parameter is passed +/// on the stack. Remember the next parameter register to allocate, +/// and then confiscate the rest of the parameter registers to ensure +/// this.
+void +llvm::ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const { + unsigned reg = State->AllocateReg(GPRArgRegs, 4); + assert((State->getCallOrPrologue() == Prologue || + State->getCallOrPrologue() == Call) && + "unhandled ParmContext"); + if ((!State->isFirstByValRegValid()) && + (ARM::R0 <= reg) && (reg <= ARM::R3)) { + State->setFirstByValReg(reg); + // At a call site, a byval parameter that is split between + // registers and memory needs its size truncated here. In a + // function prologue, such byval parameters are reassembled in + // memory, and are not truncated. + if (State->getCallOrPrologue() == Call) { + unsigned excess = 4 * (ARM::R4 - reg); + assert(size >= excess && "expected larger existing stack allocation"); + size -= excess; + } + } + // Confiscate any remaining parameter registers to preclude their + // assignment to subsequent parameters. + while (State->AllocateReg(GPRArgRegs, 4)) + ; +} + /// MatchingStackOffset - Return true if the given stack call argument is /// already available in the same position (relatively) of the caller's /// incoming argument stack. @@ -1813,6 +1867,16 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const { return HasRet; } +bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!EnableARMTailCalls) + return false; + + if (!CI->isTailCall()) + return false; + + return !Subtarget->isThumb1Only(); +} + // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise @@ -2096,7 +2160,7 @@ ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0)); } SDValue @@ -2151,6 +2215,13 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, } return Result; } + case Intrinsic::arm_neon_vmulls: + case Intrinsic::arm_neon_vmullu: { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) + ? ARMISD::VMULLs : ARMISD::VMULLu; + return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } } } @@ -2257,6 +2328,88 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); } +void +ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, + unsigned &VARegSize, unsigned &VARegSaveSize) + const { + unsigned NumGPRs; + if (CCInfo.isFirstByValRegValid()) + NumGPRs = ARM::R4 - CCInfo.getFirstByValReg(); + else { + unsigned int firstUnalloced; + firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs, + sizeof(GPRArgRegs) / + sizeof(GPRArgRegs[0])); + NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0; + } + + unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); + VARegSize = NumGPRs * 4; + VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); +} + +// The remaining GPRs hold either the beginning of variable-argument +// data, or the beginning of an aggregate passed by value (usually +// byval). Either way, we allocate stack slots adjacent to the data +// provided by our caller, and store the unallocated registers there.
+// If this is a variadic function, the va_list pointer will begin with +// these values; otherwise, this reassembles a (byval) structure that +// was split between registers and memory. +void +ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, + unsigned ArgOffset) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned firstRegToSaveIndex; + if (CCInfo.isFirstByValRegValid()) + firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0; + else { + firstRegToSaveIndex = CCInfo.getFirstUnallocated + (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); + } + + unsigned VARegSize, VARegSaveSize; + computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); + if (VARegSaveSize) { + // If this function is vararg, store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by dereferencing + // the result of va_next. + AFI->setVarArgsRegSaveSize(VARegSaveSize); + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize, + ArgOffset + VARegSaveSize + - VARegSize, + false)); + SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), + getPointerTy()); + + SmallVector<SDValue, 4> MemOps; + for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) { + TargetRegisterClass *RC; + if (AFI->isThumb1OnlyFunction()) + RC = ARM::tGPRRegisterClass; + else + RC = ARM::GPRRegisterClass; + + unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()), + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, + DAG.getConstant(4, getPointerTy())); + } + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); + } else + // This will point to the next argument passed via stack. + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); +} + SDValue ARMTargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2265,7 +2418,6 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2275,12 +2427,15 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); + CCInfo.setCallOrPrologue(Prologue); CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv, /* Return*/ false, isVarArg)); SmallVector<SDValue, 16> ArgValues; + int lastInsIndex = -1; + SDValue ArgValue; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -2288,7 +2443,6 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - SDValue ArgValue; if (VA.needsCustom()) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots.
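As a worked instance of the computeRegArea arithmetic above: if the first byval register is r2, two GPRs (r2 and r3) still hold caller-provided data, so VARegSize is 8 and, with 8-byte stack alignment, VARegSaveSize is also 8. A hypothetical standalone model of that calculation (regAreaSize is illustrative only and not part of this patch):

    #include <cassert>

    // Model of computeRegArea: firstReg is the index of the first APCS
    // register still holding argument data (0..3 for r0..r3, 4 if none);
    // returns the save-area size in bytes, rounded up to the stack
    // alignment (align must be a power of two, as stack alignments are).
    static unsigned regAreaSize(unsigned firstReg, unsigned align) {
      unsigned numGPRs = (firstReg <= 3) ? (4 - firstReg) : 0;
      unsigned varRegSize = numGPRs * 4;              // 4 bytes per GPR
      return (varRegSize + align - 1) & ~(align - 1); // align up
    }

    int main() {
      assert(regAreaSize(2, 8) == 8);  // byval split at r2: spill r2 and r3
      assert(regAreaSize(0, 8) == 16); // all of r0-r3 spilled for varargs
      assert(regAreaSize(4, 8) == 0);  // nothing left in registers
      return 0;
    }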
@@ -2364,67 +2518,45 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, assert(VA.isMemLoc()); assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); - unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; - int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), true); + int index = ArgLocs[i].getValNo(); + + // Some Ins[] entries become multiple ArgLoc[] entries. + // Process them only once. + if (index != lastInsIndex) + { + ISD::ArgFlagsTy Flags = Ins[index].Flags; + // FIXME: For now, all byval parameter objects are marked mutable. + // This can be changed with more analysis. + // In case of tail call optimization mark all arguments mutable. + // Since they could be overwritten by lowering of arguments in case of + // a tail call. + if (Flags.isByVal()) { + unsigned VARegSize, VARegSaveSize; + computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize); + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0); + unsigned Bytes = Flags.getByValSize() - VARegSize; + if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. + int FI = MFI->CreateFixedObject(Bytes, + VA.getLocMemOffset(), false); + InVals.push_back(DAG.getFrameIndex(FI, getPointerTy())); + } else { + int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, + VA.getLocMemOffset(), true); - // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0)); + } + lastInsIndex = index; + } } } // varargs - if (isVarArg) { - static const unsigned GPRArgRegs[] = { - ARM::R0, ARM::R1, ARM::R2, ARM::R3 - }; - - unsigned NumGPRs = CCInfo.getFirstUnallocated - (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); - - unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); - unsigned VARegSize = (4 - NumGPRs) * 4; - unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1); - unsigned ArgOffset = CCInfo.getNextStackOffset(); - if (VARegSaveSize) { - // If this function is vararg, store any remaining integer argument regs - // to their spots on the stack so that they may be loaded by deferencing - // the result of va_next. - AFI->setVarArgsRegSaveSize(VARegSaveSize); - AFI->setVarArgsFrameIndex( - MFI->CreateFixedObject(VARegSaveSize, - ArgOffset + VARegSaveSize - VARegSize, - false)); - SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), - getPointerTy()); - - SmallVector<SDValue, 4> MemOps; - for (; NumGPRs < 4; ++NumGPRs) { - TargetRegisterClass *RC; - if (AFI->isThumb1OnlyFunction()) - RC = ARM::tGPRRegisterClass; - else - RC = ARM::GPRRegisterClass; - - unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getConstant(4, getPointerTy())); - } - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOps[0], MemOps.size()); - } else - // This will point to the next argument passed via stack. 
- AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); - } + if (isVarArg) + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset()); return Chain; } @@ -2517,6 +2649,27 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); } +/// duplicateCmp - Glue values can have only one use, so this function +/// duplicates a comparison node. +SDValue +ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { + unsigned Opc = Cmp.getOpcode(); + DebugLoc DL = Cmp.getDebugLoc(); + if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) + return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); + + assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); + Cmp = Cmp.getOperand(0); + Opc = Cmp.getOpcode(); + if (Opc == ARMISD::CMPFP) + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); + else { + assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); + Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); + } + return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); +} + SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); @@ -2552,7 +2705,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Cond.getValueType(); SDValue ARMcc = Cond.getOperand(2); SDValue CCR = Cond.getOperand(3); - SDValue Cmp = Cond.getOperand(4); + SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp); } } @@ -2681,8 +2834,8 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { // If one of the operand is zero, it's safe to ignore the NaN case since // we only care about equality comparisons. (SeenZero || (DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS)))) { - // If unsafe fp math optimization is enabled and there are no othter uses of - // the CMP operands, and the condition code is EQ oe NE, we can optimize it + // If unsafe fp math optimization is enabled and there are no other uses of + // the CMP operands, and the condition code is EQ or NE, we can optimize it // to an integer comparison. 
if (CC == ISD::SETOEQ) CC = ISD::SETEQ; @@ -2811,8 +2964,39 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); } +static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + EVT OperandVT = Op.getOperand(0).getValueType(); + assert(OperandVT == MVT::v4i16 && "Invalid type for custom lowering!"); + if (VT != MVT::v4f32) + return DAG.UnrollVectorOp(Op.getNode()); + + unsigned CastOpc; + unsigned Opc; + switch (Op.getOpcode()) { + default: + assert(0 && "Invalid opcode!"); + case ISD::SINT_TO_FP: + CastOpc = ISD::SIGN_EXTEND; + Opc = ISD::SINT_TO_FP; + break; + case ISD::UINT_TO_FP: + CastOpc = ISD::ZERO_EXTEND; + Opc = ISD::UINT_TO_FP; + break; + } + + Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); + return DAG.getNode(Opc, dl, VT, Op); +} + static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); + if (VT.isVector()) + return LowerVectorINT_TO_FP(Op, DAG); + DebugLoc dl = Op.getDebugLoc(); unsigned Opc; @@ -2860,7 +3044,10 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), DAG.getConstant(32, MVT::i32)); - } + } else if (VT == MVT::f32) + Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), + DAG.getConstant(32, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); @@ -2869,11 +3056,11 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); - + SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); - if (SrcVT == MVT::f32) { + if (VT == MVT::f32) { Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, DAG.getConstant(0, MVT::i32)); @@ -3508,6 +3695,13 @@ static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT, return true; } +static bool isVTBLMask(const SmallVectorImpl<int> &M, EVT VT) { + // We can handle <8 x i8> vector shuffles. If the index in the mask is out of + // range, then 0 is placed into the resulting vector. So pretty much any mask + // of 8 elements can work here. + return VT == MVT::v8i8 && M.size() == 8; +} + static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); @@ -3947,6 +4141,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTBLMask(M, VT) || isVTRNMask(M, VT, WhichResult) || isVUZPMask(M, VT, WhichResult) || isVZIPMask(M, VT, WhichResult) || @@ -4024,6 +4219,29 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, } } +static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, + SmallVectorImpl<int> &ShuffleMask, + SelectionDAG &DAG) { + // Check to see if we can use the VTBL instruction. 
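+  // VTBL's architected behavior is what makes the "any mask of 8 elements"
+  // rule above safe: a byte index outside the table reads as zero instead
+  // of being undefined. One lane of VTBL1, modeled as a scalar sketch:
+  //
+  //   uint8_t vtbl1_lane(const uint8_t table[8], uint8_t index) {
+  //     return index < 8 ? table[index] : 0;  // out-of-range selects 0
+  //   }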
+ SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + DebugLoc DL = Op.getDebugLoc(); + + SmallVector<SDValue, 8> VTBLMask; + for (SmallVectorImpl<int>::iterator + I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) + VTBLMask.push_back(DAG.getConstant(*I, MVT::i32)); + + if (V2.getNode()->getOpcode() == ISD::UNDEF) + return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, + &VTBLMask[0], 8)); + + return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, + &VTBLMask[0], 8)); +} + static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -4141,6 +4359,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, VT, Val); } + if (VT == MVT::v8i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG); + if (NewOp.getNode()) + return NewOp; + } + return SDValue(); } @@ -4290,6 +4514,28 @@ static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) { MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); } +static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isSignExtended(N0, DAG) && isSignExtended(N1, DAG); + } + return false; +} + +static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); + } + return false; +} + static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. @@ -4298,29 +4544,73 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; - if (isSignExtended(N0, DAG) && isSignExtended(N1, DAG)) + bool isMLA = false; + bool isN0SExt = isSignExtended(N0, DAG); + bool isN1SExt = isSignExtended(N1, DAG); + if (isN0SExt && isN1SExt) NewOpc = ARMISD::VMULLs; - else if (isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG)) - NewOpc = ARMISD::VMULLu; - else if (VT == MVT::v2i64) - // Fall through to expand this. It is not legal. - return SDValue(); - else - // Other vector multiplications are legal. - return Op; + else { + bool isN0ZExt = isZeroExtended(N0, DAG); + bool isN1ZExt = isZeroExtended(N1, DAG); + if (isN0ZExt && isN1ZExt) + NewOpc = ARMISD::VMULLu; + else if (isN1SExt || isN1ZExt) { + // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these + // into (s/zext A * s/zext C) + (s/zext B * s/zext C) + if (isN1SExt && isAddSubSExt(N0, DAG)) { + NewOpc = ARMISD::VMULLs; + isMLA = true; + } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { + NewOpc = ARMISD::VMULLu; + isMLA = true; + } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { + std::swap(N0, N1); + NewOpc = ARMISD::VMULLu; + isMLA = true; + } + } + + if (!NewOpc) { + if (VT == MVT::v2i64) + // Fall through to expand this. It is not legal. + return SDValue(); + else + // Other vector multiplications are legal. 
+        return Op;
+    }
+  }
 
   // Legalize to a VMULL instruction.
   DebugLoc DL = Op.getDebugLoc();
-  SDValue Op0 = SkipExtension(N0, DAG);
+  SDValue Op0;
   SDValue Op1 = SkipExtension(N1, DAG);
-
-  assert(Op0.getValueType().is64BitVector() &&
-         Op1.getValueType().is64BitVector() &&
-         "unexpected types for extended operands to VMULL");
-  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  if (!isMLA) {
+    Op0 = SkipExtension(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+
+  // Optimizing (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
+  // isel lowering to take advantage of no-stall back-to-back vmul + vmla.
+  //   vmull q0, d4, d6
+  //   vmlal q0, d5, d6
+  // is faster than
+  //   vaddl q0, d4, d5
+  //   vmovl q1, d6
+  //   vmul  q0, q0, q1
+  SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
 }
 
-static SDValue
+static SDValue
 LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
   // Convert to float
   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
@@ -4331,7 +4621,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
   // Get reciprocal estimate.
   // float4 recip = vrecpeq_f32(yf);
-  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
   // Because char has a smaller range than uchar, we can actually get away
   // without any newton steps.  This requires that we use a weird bias
@@ -4349,7 +4639,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
   return X;
 }
 
-static SDValue
+static SDValue
 LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
   SDValue N2;
   // Convert to float.
@@ -4359,13 +4649,13 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
-  
+
   // Use reciprocal estimate and one refinement step.
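  // VRECPE only returns a coarse estimate, and vrecps(y, r) computes 2 - y*r,
  // so "recip *= vrecps(y, recip)" is one Newton-Raphson step for 1/y,
  // roughly doubling the number of correct bits. Scalar model of the
  // refinement (sketch only):
  //
  //   float refined = r * (2.0f - y * r);  // one vrecps + one vmul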
// float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); - N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1); - N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, + N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), N1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); @@ -4395,15 +4685,15 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; - + if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); - + N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, @@ -4414,7 +4704,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG); - + N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); return N0; } @@ -4430,32 +4720,32 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; - + if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); - + N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(0)); - + N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 - + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG); - - N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, + + N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32), N0); return N0; } - + // v4i16 sdiv ... Convert to float. 
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
@@ -4468,13 +4758,13 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
   // float4 recip = vrecpeq_f32(yf);
   // recip *= vrecpsq_f32(yf, recip);
   // recip *= vrecpsq_f32(yf, recip);
-  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
-  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
                    N1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
-  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
                    N1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
@@ -4503,7 +4793,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::GlobalAddress: return Subtarget->isTargetDarwin() ?
       LowerGlobalAddressDarwin(Op, DAG) : LowerGlobalAddressELF(Op, DAG);
-  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG); 
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   case ISD::SELECT:        return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
@@ -4524,7 +4814,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::EH_SJLJ_DISPATCHSETUP: return LowerEH_SJLJ_DISPATCHSETUP(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                                Subtarget);
-  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG); 
+  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:          return LowerShift(Op.getNode(), DAG, Subtarget);
@@ -4754,6 +5044,109 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   return BB;
 }
 
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned Size,
+                                          bool signExtend,
+                                          ARMCC::CondCodes Cond) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptr = MI->getOperand(1).getReg();
+  unsigned incr = MI->getOperand(2).getReg();
+  unsigned oldval = dest;
+  DebugLoc dl = MI->getDebugLoc();
+
+  bool isThumb2 = Subtarget->isThumb2();
+  unsigned ldrOpc, strOpc, extendOpc;
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
+    extendOpc = isThumb2 ? ARM::t2SXTBr : ARM::SXTBr;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    extendOpc = isThumb2 ? ARM::t2SXTHr : ARM::SXTHr;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    extendOpc = 0;
+    break;
+  }
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
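+  // For orientation: the ldrex/strex loop assembled below has the semantics
+  // of this portable CAS loop (a sketch of the result for the signed-min
+  // case, not of the emitted code):
+  //
+  //   int32_t atomic_load_min(std::atomic<int32_t> &a, int32_t incr) {
+  //     int32_t old = a.load();
+  //     while (!a.compare_exchange_weak(old, old < incr ? old : incr)) {
+  //       // 'old' is refreshed on failure, like the strex/bne retry
+  //     }
+  //     return old;  // the result is the value before the update
+  //   }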
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+  unsigned scratch2 = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrex dest, ptr
+  //   (sign extend dest, if required)
+  //   cmp dest, incr
+  //   cmov.cond scratch2, dest, incr
+  //   strex scratch, scratch2, ptr
+  //   cmp scratch, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+
+  // Sign extend the value, if necessary.
+  if (signExtend && extendOpc) {
+    oldval = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval).addReg(dest));
+  }
+
+  // Build compare and cmov instructions.
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                 .addReg(oldval).addReg(incr));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
+    .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR);
+
+  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
+                 .addReg(ptr));
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
 static
 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
@@ -4763,6 +5156,72 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   llvm_unreachable("Expecting a BB with two successors!");
 }
 
+// FIXME: This opcode table should obviously be expressed in the target
+// description. We probably just need a "machine opcode" value in the pseudo
+// instruction. But the ideal solution may be to simply remove the "S" version
+// of the opcode altogether.
+struct AddSubFlagsOpcodePair {
+  unsigned PseudoOpc;
+  unsigned MachineOpc;
+};
+
+static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+  {ARM::ADCSri, ARM::ADCri},
+  {ARM::ADCSrr, ARM::ADCrr},
+  {ARM::ADCSrs, ARM::ADCrs},
+  {ARM::SBCSri, ARM::SBCri},
+  {ARM::SBCSrr, ARM::SBCrr},
+  {ARM::SBCSrs, ARM::SBCrs},
+  {ARM::RSBSri, ARM::RSBri},
+  {ARM::RSBSrr, ARM::RSBrr},
+  {ARM::RSBSrs, ARM::RSBrs},
+  {ARM::RSCSri, ARM::RSCri},
+  {ARM::RSCSrs, ARM::RSCrs},
+  {ARM::t2ADCSri, ARM::t2ADCri},
+  {ARM::t2ADCSrr, ARM::t2ADCrr},
+  {ARM::t2ADCSrs, ARM::t2ADCrs},
+  {ARM::t2SBCSri, ARM::t2SBCri},
+  {ARM::t2SBCSrr, ARM::t2SBCrr},
+  {ARM::t2SBCSrs, ARM::t2SBCrs},
+  {ARM::t2RSBSri, ARM::t2RSBri},
+  {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+// Convert an Add or Subtract with Carry and Flags to a generic opcode with
+// CPSR<def> operand. e.g. ADCS (...) -> ADC (... CPSR<def>).
+//
+// FIXME: Somewhere we should assert that CPSR<def> is in the correct
+// position to be recognized by the target description as the 'S' bit.
+bool ARMTargetLowering::RemapAddSubWithFlags(MachineInstr *MI, + MachineBasicBlock *BB) const { + unsigned OldOpc = MI->getOpcode(); + unsigned NewOpc = 0; + + // This is only called for instructions that need remapping, so iterating over + // the tiny opcode table is not costly. + static const int NPairs = + sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair); + for (AddSubFlagsOpcodePair *Pair = &AddSubFlagsOpcodeMap[0], + *End = &AddSubFlagsOpcodeMap[NPairs]; Pair != End; ++Pair) { + if (OldOpc == Pair->PseudoOpc) { + NewOpc = Pair->MachineOpc; + break; + } + } + if (!NewOpc) + return false; + + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); + for (unsigned i = 0; i < MI->getNumOperands(); ++i) + MIB.addOperand(MI->getOperand(i)); + AddDefaultPred(MIB); + MIB.addReg(ARM::CPSR, RegState::Define); // S bit + MI->eraseFromParent(); + return true; +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -4770,10 +5229,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); switch (MI->getOpcode()) { - default: + default: { + if (RemapAddSubWithFlags(MI, BB)) + return BB; + MI->dump(); llvm_unreachable("Unexpected instr type to insert"); - + } case ARM::ATOMIC_LOAD_ADD_I8: return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr); case ARM::ATOMIC_LOAD_ADD_I16: @@ -4816,6 +5278,34 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::ATOMIC_LOAD_SUB_I32: return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr); + case ARM::ATOMIC_LOAD_MIN_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT); + case ARM::ATOMIC_LOAD_MIN_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT); + case ARM::ATOMIC_LOAD_MIN_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT); + + case ARM::ATOMIC_LOAD_MAX_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT); + case ARM::ATOMIC_LOAD_MAX_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT); + case ARM::ATOMIC_LOAD_MAX_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT); + + case ARM::ATOMIC_LOAD_UMIN_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO); + case ARM::ATOMIC_LOAD_UMIN_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO); + case ARM::ATOMIC_LOAD_UMIN_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO); + + case ARM::ATOMIC_LOAD_UMAX_I8: + return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI); + case ARM::ATOMIC_LOAD_UMAX_I16: + return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI); + case ARM::ATOMIC_LOAD_UMAX_I32: + return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI); + case ARM::ATOMIC_SWAP_I8: return EmitAtomicBinary(MI, BB, 1, 0); case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0); case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0); @@ -5034,6 +5524,42 @@ static SDValue PerformSUBCombine(SDNode *N, return SDValue(); } +/// PerformVMULCombine +/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the +/// special multiplier accumulator forwarding. 
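+/// Algebraically this is plain distributivity,
+///   (A + B) * C ==> (A * C) + (B * C)
+/// (for the FADD/FSUB forms the rewrite can change rounding, so it is a
+/// performance canonicalization rather than a bit-exact identity). On cores
+/// with VMLx forwarding the right-hand side maps onto the cheaper sequence: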
+///   vmul d3, d0, d2
+///   vmla d3, d1, d2
+/// is faster than
+///   vadd d3, d0, d1
+///   vmul d3, d3, d2
+static SDValue PerformVMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasVMLxForwarding())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  unsigned Opcode = N0.getOpcode();
+  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
+    Opcode = N1.getOpcode();
+    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+        Opcode != ISD::FADD && Opcode != ISD::FSUB)
+      return SDValue();
+    std::swap(N0, N1);
+  }
+
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  return DAG.getNode(Opcode, DL, VT,
+                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
+                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
+}
+
 static SDValue PerformMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
@@ -5046,6 +5572,8 @@ static SDValue PerformMULCombine(SDNode *N,
     return SDValue();
 
   EVT VT = N->getValueType(0);
+  if (VT.is64BitVector() || VT.is128BitVector())
+    return PerformVMULCombine(N, DCI, Subtarget);
   if (VT != MVT::i32)
     return SDValue();
 
@@ -5088,12 +5616,16 @@ static SDValue PerformMULCombine(SDNode *N,
 
 static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // Attempt to use immediate-form VBIC
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
   DebugLoc dl = N->getDebugLoc();
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
 
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
@@ -5127,6 +5659,9 @@ static SDValue PerformORCombine(SDNode *N,
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
 
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
@@ -5147,6 +5682,37 @@ static SDValue PerformORCombine(SDNode *N,
     }
   }
 
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND)
+    return SDValue();
+  SDValue N1 = N->getOperand(1);
+
+  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    APInt SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+
+    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+    APInt SplatBits0;
+    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs) && !HasAnyUndefs) {
+      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+      APInt SplatBits1;
+      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                        HasAnyUndefs) && !HasAnyUndefs &&
+          SplatBits0 == ~SplatBits1) {
+        // Canonicalize the vector type to make instruction selection simpler.
+        EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+        SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                     N0->getOperand(1), N0->getOperand(0),
+                                     N1->getOperand(0));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      }
+    }
+  }
+
   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
   // reasonable.
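// BFI Rd, Rn, #lsb, #width overwrites Rd[lsb+width-1:lsb] with Rn[width-1:0]
// and preserves every other bit of Rd. In the inverted-mask convention used
// by this combine (Mask marks the bits of the first operand that survive),
// that behavior can be modeled as follows (a sketch; it assumes ~Mask is a
// proper, non-empty bitfield so the trailing-zero count is well defined):

static inline unsigned BFIModel(unsigned A, unsigned Src, unsigned Mask) {
  unsigned Lsb = CountTrailingZeros_32(~Mask);  // start of the open field
  return (A & Mask) | ((Src << Lsb) & ~Mask);   // insert Src into the hole
}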
@@ -5154,19 +5720,16 @@ static SDValue PerformORCombine(SDNode *N,
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
-  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
   DebugLoc DL = N->getDebugLoc();
   // 1) or (and A, mask), val => ARMbfi A, val, mask
   //      iff (val & mask) == val
   //
   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
   // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
-  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //          && mask == ~mask2
   // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
-  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //          && ~mask == mask2
   //          (i.e., copy a bitfield value into another bitfield of the same
   //          width)
-  if (N0.getOpcode() != ISD::AND)
-    return SDValue();
 
   if (VT != MVT::i32)
     return SDValue();
@@ -5209,26 +5772,26 @@ static SDValue PerformORCombine(SDNode *N,
       return SDValue();
     unsigned Mask2 = N11C->getZExtValue();
 
+    // Mask and ~Mask2 (or the reverse) must be equal for the BFI pattern
+    // to match as-is.
     if (ARM::isBitFieldInvertedMask(Mask) &&
-        ARM::isBitFieldInvertedMask(~Mask2) &&
-        (CountPopulation_32(Mask) == CountPopulation_32(~Mask2))) {
+        (Mask == ~Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
       // so use that when it's available.
       if (Subtarget->hasT2ExtractPack() &&
           (Mask == 0xffff || Mask == 0xffff0000))
         return SDValue();
       // 2a
-      unsigned lsb = CountTrailingZeros_32(Mask2);
+      unsigned amt = CountTrailingZeros_32(Mask2);
       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
-                        DAG.getConstant(lsb, MVT::i32));
+                        DAG.getConstant(amt, MVT::i32));
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                         DAG.getConstant(Mask, MVT::i32));
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
       return SDValue();
     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
-               ARM::isBitFieldInvertedMask(Mask2) &&
-               (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) {
+               (~Mask == Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
       // so use that when it's available.
       if (Subtarget->hasT2ExtractPack() &&
@@ -5239,7 +5802,7 @@ static SDValue PerformORCombine(SDNode *N,
       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                         DAG.getConstant(lsb, MVT::i32));
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
-                        DAG.getConstant(Mask2, MVT::i32)); 
+                        DAG.getConstant(Mask2, MVT::i32));
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
       return SDValue();
@@ -5294,6 +5857,37 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
   SDValue InDouble = N->getOperand(0);
   if (InDouble.getOpcode() == ARMISD::VMOVDRR)
     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+
+  // vmovrrd(load f64) -> (load i32), (load i32)
+  SDNode *InNode = InDouble.getNode();
+  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
+      InNode->getValueType(0) == MVT::f64 &&
+      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
+      !cast<LoadSDNode>(InNode)->isVolatile()) {
+    // TODO: Should this be done for non-FrameIndex operands?
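+    // The win: the f64 value no longer round-trips through a VFP D register
+    // just to be torn into two core registers. In source-level terms the two
+    // loads built below compute (little-endian word order assumed; sketch):
+    //
+    //   const uint32_t *w = reinterpret_cast<const uint32_t *>(p);
+    //   uint32_t lo = w[0];  // i32 load at p+0
+    //   uint32_t hi = w[1];  // i32 load at p+4
+    //
+    // in place of an f64 load at p followed by VMOVRRD.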
+ LoadSDNode *LD = cast<LoadSDNode>(InNode); + + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = LD->getDebugLoc(); + SDValue BasePtr = LD->getBasePtr(); + SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, + LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, MVT::i32)); + SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, + LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), + std::min(4U, LD->getAlignment() / 2)); + + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); + SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); + DCI.RemoveFromWorklist(LD); + DAG.DeleteNode(LD); + return Result; + } + return SDValue(); } @@ -5323,8 +5917,28 @@ static SDValue PerformSTORECombine(SDNode *N, // Otherwise, the i64 value will be legalized to a pair of i32 values. StoreSDNode *St = cast<StoreSDNode>(N); SDValue StVal = St->getValue(); - if (!ISD::isNormalStore(St) || St->isVolatile() || - StVal.getValueType() != MVT::i64 || + if (!ISD::isNormalStore(St) || St->isVolatile()) + return SDValue(); + + if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && + StVal.getNode()->hasOneUse() && !St->isVolatile()) { + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = St->getDebugLoc(); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = DAG.getStore(St->getChain(), DL, + StVal.getNode()->getOperand(0), BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, MVT::i32)); + return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1), + OffsetPtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(4U, St->getAlignment() / 2)); + } + + if (StVal.getValueType() != MVT::i64 || StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); @@ -5553,7 +6167,7 @@ static SDValue CombineBaseUpdate(SDNode *N, EVT VecTy; if (isLoad) VecTy = N->getValueType(0); - else + else VecTy = N->getOperand(AddrOpIdx+1).getValueType(); unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (isLaneOp) @@ -5603,7 +6217,7 @@ static SDValue CombineBaseUpdate(SDNode *N, DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; - } + } return SDValue(); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index dc400c4..a2e6260 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -57,7 +57,6 @@ namespace llvm { CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. FMSTAT, // ARM fmstat instruction. CMOV, // ARM conditional move instructions. - CNEG, // ARM conditional negate instructions. BCC_i64, @@ -89,7 +88,7 @@ namespace llvm { MEMBARRIER_MCR, // Memory barrier (MCR) PRELOAD, // Preload - + VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. VCGE, // Vector compare greater than or equal. 
@@ -154,6 +153,8 @@ namespace llvm { VZIP, // zip (interleave) VUZP, // unzip (deinterleave) VTRN, // transpose + VTBL1, // 1-register shuffle with mask + VTBL2, // 2-register shuffle with mask // Vector multiply long: VMULLs, // ...signed @@ -172,12 +173,15 @@ namespace llvm { // Bit-field insert BFI, - + // Vector OR with immediate VORRIMM, // Vector AND with NOT of immediate VBICIMM, + // Vector bitwise select + VBSL, + // Vector load N-element structure to all lanes: VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD3DUP, @@ -330,9 +334,6 @@ namespace llvm { Sched::Preference getSchedulingPreference(SDNode *N) const; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const; - bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; @@ -407,7 +408,7 @@ namespace llvm { SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; @@ -425,6 +426,13 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; + void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc dl, SDValue &Chain, unsigned ArgOffset) + const; + + void computeRegArea(CCState &CCInfo, MachineFunction &MF, + unsigned &VARegSize, unsigned &VARegSaveSize) const; + virtual SDValue LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, @@ -435,6 +443,9 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; + /// HandleByVal - Target-specific cleanup for ByVal support. + virtual void HandleByVal(CCState *, unsigned &) const; + /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. 
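// HandleByVal, declared above, exists for arguments like the following, where
// the AAPCS splits a by-value aggregate between r0-r3 and the stack
// (illustrative user code; the struct size is chosen so the split happens):

struct Six { int A[6]; };   // 24 bytes
int TakeSix(Six S) {        // roughly: A[0..3] in r0-r3, A[4..5] on the stack
  return S.A[5];            // the callee must still see one contiguous
}                           // object, hence the register-save area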
@@ -456,10 +467,13 @@ namespace llvm { virtual bool isUsedByReturnOnly(SDNode *N) const; + virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; + SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const; SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, DebugLoc dl) const; + SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; @@ -470,16 +484,22 @@ namespace llvm { MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode) const; + MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, + bool signExtend, + ARMCC::CondCodes Cond) const; + bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const; }; - + enum NEONModImmType { VMOVModImm, VMVNModImm, OtherModImm }; - - + + namespace ARM { FastISel *createFastISel(FunctionLoweringInfo &funcInfo); } diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 359ac45..f5fb98e 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -206,19 +206,30 @@ def setend_op : Operand<i32> { let PrintMethod = "printSetendOperand"; } -def cps_opt : Operand<i32> { - let PrintMethod = "printCPSOptionOperand"; -} - def msr_mask : Operand<i32> { let PrintMethod = "printMSRMaskOperand"; let ParserMatchClass = MSRMaskOperand; } -// A8.6.117, A8.6.118. Different instructions are generated for #0 and #-0. -// The neg_zero operand translates -0 to -1, -1 to -2, ..., etc. -def neg_zero : Operand<i32> { - let PrintMethod = "printNegZeroOperand"; +// Shift Right Immediate - A shift right immediate is encoded differently from +// other shift immediates. The imm6 field is encoded like so: +// +// Offset Encoding +// 8 imm6<5:3> = '001', 8 - <imm> is encoded in imm6<2:0> +// 16 imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0> +// 32 imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0> +// 64 64 - <imm> is encoded in imm6<5:0> +def shr_imm8 : Operand<i32> { + let EncoderMethod = "getShiftRight8Imm"; +} +def shr_imm16 : Operand<i32> { + let EncoderMethod = "getShiftRight16Imm"; +} +def shr_imm32 : Operand<i32> { + let EncoderMethod = "getShiftRight32Imm"; +} +def shr_imm64 : Operand<i32> { + let EncoderMethod = "getShiftRight64Imm"; } //===----------------------------------------------------------------------===// @@ -279,6 +290,7 @@ class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern> let OutOperandList = oops; let InOperandList = iops; let Pattern = pattern; + let isCodeGenOnly = 1; } // PseudoInst that's ARM-mode only. @@ -422,11 +434,11 @@ class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin, opc, asm, "", pattern> { bits<4> Rd; bits<4> Rt; - bits<4> Rn; + bits<4> addr; let Inst{27-23} = 0b00011; let Inst{22-21} = opcod; let Inst{20} = 0; - let Inst{19-16} = Rn; + let Inst{19-16} = addr; let Inst{15-12} = Rd; let Inst{11-4} = 0b11111001; let Inst{3-0} = Rt; @@ -513,6 +525,24 @@ class AI2stridx<bit isByte, bit isPre, dag oops, dag iops, let Inst{19-16} = Rn; let Inst{11-0} = offset{11-0}; } +// FIXME: Merge with the above class when addrmode2 gets used for STR, STRB +// but for now use this class for STRT and STRBT. 
+class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops,
+                 IndexMode im, Format f, InstrItinClass itin, string opc,
+                 string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {17-14}  Rn
+  // {13}     1 == Rm, 0 == imm12
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<18> addr;
+  let Inst{25} = addr{13};
+  let Inst{23} = addr{12};
+  let Inst{19-16} = addr{17-14};
+  let Inst{11-0} = addr{11-0};
+}
 
 // addrmode3 instructions
 class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
@@ -547,6 +577,34 @@ class AI3ldstidx<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
   let Inst{15-12} = Rt; // Rt
   let Inst{7-4} = op;
 }
+
+// FIXME: Merge with the above class when addrmode2 gets used for LDR, LDRB
+// but for now use this class for LDRSBT, LDRHT, LDRSHT.
+class AI3ldstidxT<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
+                  IndexMode im, Format f, InstrItinClass itin, string opc,
+                  string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, im, f, itin,
+      opc, asm, cstr, pattern> {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = isPre;       // P bit
+  let Inst{23}    = addr{8};     // U bit
+  let Inst{22}    = addr{13};    // 1 == imm8, 0 == Rm
+  let Inst{20}    = op20;        // L bit
+  let Inst{19-16} = addr{12-9};  // Rn
+  let Inst{15-12} = Rt;          // Rt
+  let Inst{11-8}  = addr{7-4};   // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};   // imm3_0/Rm
+  let AsmMatchConverter = "CvtLdWriteBackRegAddrMode3";
+}
+
 class AI3stridx<bits<4> op, bit isByte, bit isPre, dag oops, dag iops,
                 IndexMode im, Format f, InstrItinClass itin, string opc,
                 string asm, string cstr, list<dag> pattern>
@@ -619,12 +677,25 @@ class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin,
              string opc, string asm, string cstr, list<dag> pattern>
   : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
       opc, asm, cstr,pattern> {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{3-0}   = addr{3-0};   // imm3_0/Rm
   let Inst{4}     = 1;
   let Inst{5}     = 1; // H bit
   let Inst{6}     = 0; // S bit
   let Inst{7}     = 1;
+  let Inst{11-8}  = addr{7-4};   // imm7_4/zero
+  let Inst{15-12} = Rt;          // Rt
+  let Inst{19-16} = addr{12-9};  // Rn
   let Inst{20}    = 0; // L bit
   let Inst{21}    = 0; // W bit
+  let Inst{22}    = addr{13};    // 1 == imm8, 0 == Rm
+  let Inst{23}    = addr{8};     // U bit
   let Inst{24}    = 0; // P bit
   let Inst{27-25} = 0b000;
 }
@@ -1670,7 +1741,8 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
 }
 
 // NEON 3 vector register format.
-class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + +class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { @@ -1680,6 +1752,13 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{11-8} = op11_8; let Inst{6} = op6; let Inst{4} = op4; +} + +class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { // Instruction operands. bits<5> Vd; @@ -1694,6 +1773,47 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{5} = Vm{4}; } +class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + bit lane; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = lane; +} + +class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string dt, string asm, string cstr, list<dag> pattern> + : N3VCommon<op24, op23, op21_20, op11_8, op6, op4, + oops, iops, f, itin, opc, dt, asm, cstr, pattern> { + + // Instruction operands. + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + bits<2> lane; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{7} = Vn{4}; + let Inst{2-0} = Vm{2-0}; + let Inst{5} = lane{1}; + let Inst{3} = lane{0}; +} + // Same as N3V except it doesn't have a data type suffix. 
class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, @@ -1730,6 +1850,8 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, let Inst{11-8} = opcod2; let Inst{6-5} = opcod3; let Inst{4} = 1; + // A8.6.303, A8.6.328, A8.6.329 + let Inst{3-0} = 0b0000; let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 6e3fe2e..209c1a3 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -58,7 +58,7 @@ def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; -def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 0, []>; def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; @@ -93,8 +93,6 @@ def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone, def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, [SDNPInGlue]>; -def ARMcneg : SDNode<"ARMISD::CNEG", SDT_ARMCMov, - [SDNPInGlue]>; def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; @@ -205,13 +203,13 @@ def so_imm_not_XFORM : SDNodeXForm<imm, [{ }]>; /// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15]. -def imm1_15 : PatLeaf<(i32 imm), [{ - return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 16; +def imm1_15 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 1 && (int32_t)Imm < 16; }]>; /// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. -def imm16_31 : PatLeaf<(i32 imm), [{ - return (int32_t)N->getZExtValue() >= 16 && (int32_t)N->getZExtValue() < 32; +def imm16_31 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 16 && (int32_t)Imm < 32; }]>; def so_imm_neg : @@ -241,8 +239,8 @@ def lo16AllZero : PatLeaf<(i32 imm), [{ /// imm0_65535 predicate - True if the 32-bit immediate is in the range /// [0.65535]. -def imm0_65535 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 65536; +def imm0_65535 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 65536; }]>; class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; @@ -377,17 +375,23 @@ def neon_vcvt_imm32 : Operand<i32> { } // rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24. -def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{ - int32_t v = (int32_t)N->getZExtValue(); +def rot_imm : Operand<i32>, ImmLeaf<i32, [{ + int32_t v = (int32_t)Imm; return v == 8 || v == 16 || v == 24; }]> { let EncoderMethod = "getRotImmOpValue"; } +def ShifterAsmOperand : AsmOperandClass { + let Name = "Shifter"; + let SuperClasses = []; +} + // shift_imm: An integer that encodes a shift amount and the type of shift // (currently either asr or lsl) using the same encoding used for the // immediates in so_reg operands. def shift_imm : Operand<i32> { let PrintMethod = "printShiftImmOperand"; + let ParserMatchClass = ShifterAsmOperand; } // shifter_operand operands: so_reg and so_imm. 
@@ -396,19 +400,21 @@
 def so_reg : Operand<i32>,    // reg reg imm
              ComplexPattern<i32, 3, "SelectShifterOperandReg",
                             [shl,srl,sra,rotr]> {
   let EncoderMethod = "getSORegOpValue";
   let PrintMethod = "printSORegOperand";
-  let MIOperandInfo = (ops GPR, GPR, i32imm);
+  let MIOperandInfo = (ops GPR, GPR, shift_imm);
 }
 def shift_so_reg : Operand<i32>,    // reg reg imm
                    ComplexPattern<i32, 3, "SelectShiftShifterOperandReg",
                                   [shl,srl,sra,rotr]> {
   let EncoderMethod = "getSORegOpValue";
   let PrintMethod = "printSORegOperand";
-  let MIOperandInfo = (ops GPR, GPR, i32imm);
+  let MIOperandInfo = (ops GPR, GPR, shift_imm);
 }
 
 // so_imm - Match a 32-bit shifter_operand immediate operand, which is an
 // 8-bit immediate rotated by an arbitrary number of bits.
-def so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_so_imm(N); }]> {
+def so_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }]> {
   let EncoderMethod = "getSOImmOpValue";
   let PrintMethod = "printSOImmOperand";
 }
@@ -429,13 +435,13 @@ def arm_i32imm : PatLeaf<(imm), [{
 }]>;
 
 /// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
-def imm0_31 : Operand<i32>, PatLeaf<(imm), [{
-  return (int32_t)N->getZExtValue() < 32;
+def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
 }]>;
 
 /// imm0_31_m1 - Matches and prints like imm0_31, but encodes as 'value - 1'.
-def imm0_31_m1 : Operand<i32>, PatLeaf<(imm), [{
-  return (int32_t)N->getZExtValue() < 32;
+def imm0_31_m1 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
 }]> {
   let EncoderMethod = "getImmMinusOneOpValue";
 }
@@ -458,19 +464,30 @@ def bf_inv_mask_imm : Operand<i32>,
 }
 
 /// lsb_pos_imm - position of the lsb bit, used by BFI4p and t2BFI4p
-def lsb_pos_imm : Operand<i32>, PatLeaf<(imm), [{
-  return isInt<5>(N->getSExtValue());
+def lsb_pos_imm : Operand<i32>, ImmLeaf<i32, [{
+  return isInt<5>(Imm);
 }]>;
 
 /// width_imm - number of bits to be copied, used by BFI4p and t2BFI4p
-def width_imm : Operand<i32>, PatLeaf<(imm), [{
-  return N->getSExtValue() > 0 &&  N->getSExtValue() <= 32;
+def width_imm : Operand<i32>, ImmLeaf<i32, [{
+  return Imm > 0 && Imm <= 32;
 }] > {
   let EncoderMethod = "getMsbOpValue";
 }
 
 // Define ARM specific addressing modes.
+def MemMode2AsmOperand : AsmOperandClass {
+  let Name = "MemMode2";
+  let SuperClasses = [];
+  let ParserMethod = "tryParseMemMode2Operand";
+}
+
+def MemMode3AsmOperand : AsmOperandClass {
+  let Name = "MemMode3";
+  let SuperClasses = [];
+  let ParserMethod = "tryParseMemMode3Operand";
+}
 
 // addrmode_imm12 := reg +/- imm12
 //
@@ -501,6 +518,7 @@ def addrmode2 : Operand<i32>,
                 ComplexPattern<i32, 3, "SelectAddrMode2", []> {
   let EncoderMethod = "getAddrMode2OpValue";
   let PrintMethod = "printAddrMode2Operand";
+  let ParserMatchClass = MemMode2AsmOperand;
   let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
 }
 
@@ -519,6 +537,7 @@ def addrmode3 : Operand<i32>,
                 ComplexPattern<i32, 3, "SelectAddrMode3", []> {
   let EncoderMethod = "getAddrMode3OpValue";
   let PrintMethod = "printAddrMode3Operand";
+  let ParserMatchClass = MemMode3AsmOperand;
   let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
 }
 
@@ -586,6 +605,21 @@ def addrmodepc : Operand<i32>,
   let MIOperandInfo = (ops GPR, i32imm);
 }
 
+def MemMode7AsmOperand : AsmOperandClass {
+  let Name = "MemMode7";
+  let SuperClasses = [];
+}
+
+// addrmode7 := reg
+// Used by load/store exclusive instructions. Enables correct assembly
+// parsing and printing. Not used for any codegen matching.
+// +def addrmode7 : Operand<i32> { + let PrintMethod = "printAddrMode7Operand"; + let MIOperandInfo = (ops GPR); + let ParserMatchClass = MemMode7AsmOperand; +} + def nohash_imm : Operand<i32> { let PrintMethod = "printNoHashImmediate"; } @@ -902,52 +936,23 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{19-16} = Rn; } } +} + // Carry setting variants -let isCodeGenOnly = 1, Defs = [CPSR] in { -multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> { - def Sri : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), - DPFrm, IIC_iALUi, !strconcat(opc, "\t$Rd, $Rn, $imm"), - [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<12> imm; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{11-0} = imm; - let Inst{20} = 1; - let Inst{25} = 1; - } - def Srr : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), - DPFrm, IIC_iALUr, !strconcat(opc, "\t$Rd, $Rn, $Rm"), - [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<4> Rm; - let Inst{11-4} = 0b00000000; +// NOTE: CPSR def omitted because it will be handled by the custom inserter. +let usesCustomInserter = 1 in { +multiclass AI1_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> { + def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + Size4Bytes, IIC_iALUi, + [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>; + def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + Size4Bytes, IIC_iALUr, + [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> { let isCommutable = Commutable; - let Inst{3-0} = Rm; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{20} = 1; - let Inst{25} = 0; - } - def Srs : AXI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), - DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "\t$Rd, $Rn, $shift"), - [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<12> shift; - let Inst{11-0} = shift; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{20} = 1; - let Inst{25} = 0; } -} + def rs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + Size4Bytes, IIC_iALUsr, + [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>; } } @@ -972,6 +977,7 @@ multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii, [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> { bits<4> Rt; bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{19-16} = shift{16-13}; // Rn let Inst{15-12} = Rt; @@ -1001,6 +1007,7 @@ multiclass AI_str1<bit isByte, string opc, InstrItinClass iii, [(opnode GPR:$Rt, ldst_so_reg:$shift)]> { bits<4> Rt; bits<17> shift; + let shift{4} = 0; // Inst{4} = 0 let Inst{23} = shift{12}; // U (add = ('U' == 1)) let Inst{19-16} = shift{16-13}; // Rn let Inst{15-12} = Rt; @@ -1249,7 +1256,7 @@ let neverHasSideEffects = 1, isReMaterializable = 1 in // The 'adr' mnemonic encodes differently if the label is before or after // the instruction. The {24-21} opcode bits are set by the fixup, as we don't // know until then which form of the instruction will be used. -def ADR : AI1<0, (outs GPR:$Rd), (ins adrlabel:$label), +def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label), MiscFrm, IIC_iALUi, "adr", "\t$Rd, #$label", []> { bits<4> Rd; bits<12> label; @@ -1311,6 +1318,9 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // before calls from potentially appearing dead. 
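// The S-variants exist to chain multi-word arithmetic: an i64 add on 32-bit
// registers is an ADDS (set carry) followed by an ADC (consume carry), and
// ADCS additionally sets the flags again so longer chains can continue. A
// C++ model of the carry (sketch; the carry out of the low half is exactly
// the unsigned-overflow test):

uint64_t Add64Model(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
  uint32_t Lo = ALo + BLo;                       // ADDS: sets C
  uint32_t Hi = AHi + BHi + (Lo < ALo ? 1 : 0);  // ADC:  consumes C
  return ((uint64_t)Hi << 32) | Lo;
}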
let isCall = 1, // On non-Darwin platforms R9 is callee-saved. + // FIXME: Do we really need a non-predicated version? If so, it should + // at least be a pseudo instruction expanding to the predicated version + // at MC lowering time. Defs = [R0, R1, R2, R3, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D16, D17, D18, D19, D20, D21, D22, D23, @@ -1340,7 +1350,16 @@ let isCall = 1, Requires<[IsARM, HasV5T, IsNotDarwin]> { bits<4> func; let Inst{31-4} = 0b1110000100101111111111110011; - let Inst{3-0} = func; + let Inst{3-0} = func; + } + + def BLX_pred : AI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, + IIC_Br, "blx", "\t$func", + [(ARMcall_pred GPR:$func)]>, + Requires<[IsARM, HasV5T, IsNotDarwin]> { + bits<4> func; + let Inst{27-4} = 0b000100101111111111110011; + let Inst{3-0} = func; } // ARMv4T @@ -1364,30 +1383,25 @@ let isCall = 1, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR], Uses = [R7, SP] in { - def BLr9 : ABXI<0b1011, (outs), (ins bltarget:$func, variable_ops), - IIC_Br, "bl\t$func", - [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]> { - let Inst{31-28} = 0b1110; - bits<24> func; - let Inst{23-0} = func; - } + def BLr9 : ARMPseudoInst<(outs), (ins bltarget:$func, variable_ops), + Size4Bytes, IIC_Br, + [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]>; - def BLr9_pred : ABI<0b1011, (outs), (ins bltarget:$func, variable_ops), - IIC_Br, "bl", "\t$func", + def BLr9_pred : ARMPseudoInst<(outs), + (ins bltarget:$func, pred:$p, variable_ops), + Size4Bytes, IIC_Br, [(ARMcall_pred tglobaladdr:$func)]>, - Requires<[IsARM, IsDarwin]> { - bits<24> func; - let Inst{23-0} = func; - } + Requires<[IsARM, IsDarwin]>; // ARMv5T and above - def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, - IIC_Br, "blx\t$func", - [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> { - bits<4> func; - let Inst{31-4} = 0b1110000100101111111111110011; - let Inst{3-0} = func; - } + def BLXr9 : ARMPseudoInst<(outs), (ins GPR:$func, variable_ops), + Size4Bytes, IIC_Br, + [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]>; + + def BLXr9_pred: ARMPseudoInst<(outs), (ins GPR:$func, pred:$p, variable_ops), + Size4Bytes, IIC_Br, + [(ARMcall_pred GPR:$func)]>, + Requires<[IsARM, HasV5T, IsDarwin]>; // ARMv4T // Note: Restrict $func to the tGPR regclass to prevent it being in LR. @@ -1403,11 +1417,7 @@ let isCall = 1, // Tail calls. -// FIXME: These should probably be xformed into the non-TC versions of the -// instructions as part of MC lowering. -// FIXME: These seem to be used for both Thumb and ARM instruction selection. -// Thumb should have its own version since the instruction is actually -// different, even though the mnemonic is the same. +// FIXME: The Thumb versions of these should live in ARMInstrThumb.td let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // Darwin versions. 
let Defs = [R0, R1, R2, R3, R9, R12, @@ -1421,21 +1431,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), IIC_Br, []>, Requires<[IsDarwin]>; - def TAILJMPd : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), - IIC_Br, "b\t$dst @ TAILCALL", + def TAILJMPd : ARMPseudoInst<(outs), (ins brtarget:$dst, variable_ops), + Size4Bytes, IIC_Br, []>, Requires<[IsARM, IsDarwin]>; - def TAILJMPdt: ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), - IIC_Br, "b.w\t$dst @ TAILCALL", + def tTAILJMPd: tPseudoInst<(outs), (ins brtarget:$dst, variable_ops), + Size4Bytes, IIC_Br, []>, Requires<[IsThumb, IsDarwin]>; - def TAILJMPr : AXI<(outs), (ins tcGPR:$dst, variable_ops), - BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", - []>, Requires<[IsDarwin]> { - bits<4> dst; - let Inst{31-4} = 0b1110000100101111111111110001; - let Inst{3-0} = dst; - } + def TAILJMPr : ARMPseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + Size4Bytes, IIC_Br, + []>, Requires<[IsARM, IsDarwin]>; + + def tTAILJMPr : tPseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + Size4Bytes, IIC_Br, + []>, Requires<[IsThumb, IsDarwin]>; } // Non-Darwin versions (the difference is R9). @@ -1450,34 +1460,31 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { def TCRETURNriND : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops), IIC_Br, []>, Requires<[IsNotDarwin]>; - def TAILJMPdND : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), - IIC_Br, "b\t$dst @ TAILCALL", + def TAILJMPdND : ARMPseudoInst<(outs), (ins brtarget:$dst, variable_ops), + Size4Bytes, IIC_Br, []>, Requires<[IsARM, IsNotDarwin]>; - def TAILJMPdNDt : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), - IIC_Br, "b.w\t$dst @ TAILCALL", + def tTAILJMPdND : tPseudoInst<(outs), (ins brtarget:$dst, variable_ops), + Size4Bytes, IIC_Br, []>, Requires<[IsThumb, IsNotDarwin]>; - def TAILJMPrND : AXI<(outs), (ins tcGPR:$dst, variable_ops), - BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", - []>, Requires<[IsNotDarwin]> { - bits<4> dst; - let Inst{31-4} = 0b1110000100101111111111110001; - let Inst{3-0} = dst; - } + def TAILJMPrND : ARMPseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + Size4Bytes, IIC_Br, + []>, Requires<[IsARM, IsNotDarwin]>; + def tTAILJMPrND : tPseudoInst<(outs), (ins tcGPR:$dst, variable_ops), + Size4Bytes, IIC_Br, + []>, Requires<[IsThumb, IsNotDarwin]>; } } let isBranch = 1, isTerminator = 1 in { - // B is "predicable" since it can be xformed into a Bcc. + // B is "predicable" since it's just a Bcc with an 'always' condition. let isBarrier = 1 in { let isPredicable = 1 in - def B : ABXI<0b1010, (outs), (ins brtarget:$target), IIC_Br, - "b\t$target", [(br bb:$target)]> { - bits<24> target; - let Inst{31-28} = 0b1110; - let Inst{23-0} = target; - } + // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly + // should be sufficient. 
+ def B : ARMPseudoInst<(outs), (ins brtarget:$target), Size4Bytes, IIC_Br, + [(br bb:$target)]>; let isNotDuplicable = 1, isIndirectBranch = 1 in { def BR_JTr : ARMPseudoInst<(outs), @@ -1509,6 +1516,16 @@ let isBranch = 1, isTerminator = 1 in { } } +// BLX (immediate) -- for disassembly only +def BLXi : AXI<(outs), (ins br_target:$target), BrMiscFrm, NoItinerary, + "blx\t$target", [/* pattern left blank */]>, + Requires<[IsARM, HasV5T]> { + let Inst{31-25} = 0b1111101; + bits<25> target; + let Inst{23-0} = target{24-1}; + let Inst{24} = target{0}; +} + // Branch and Exchange Jazelle -- for disassembly only def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", [/* For disassembly only; pattern left blank */]> { @@ -1533,6 +1550,7 @@ def SVC : ABI<0b1111, (outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc", let Inst{23-0} = svc; } } +def : MnemonicAlias<"swi", "svc">; // Store Return State is a system instruction -- for disassembly only let isCodeGenOnly = 1 in { // FIXME: This should not use submode! @@ -1541,6 +1559,8 @@ def SRSW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b110; // W = 1 + let Inst{19-8} = 0xd05; + let Inst{7-5} = 0b000; } def SRS : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode), @@ -1548,6 +1568,8 @@ def SRS : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b100; // W = 0 + let Inst{19-8} = 0xd05; + let Inst{7-5} = 0b000; } // Return From Exception is a system instruction -- for disassembly only @@ -1556,6 +1578,7 @@ def RFEW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b011; // W = 1 + let Inst{15-0} = 0x0a00; } def RFE : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base), @@ -1563,6 +1586,7 @@ def RFE : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{22-20} = 0b001; // W = 0 + let Inst{15-0} = 0x0a00; } } // isCodeGenOnly = 1 @@ -1610,15 +1634,11 @@ def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm, IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr", [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>; -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, - isCodeGenOnly = 1 in { // $dst2 doesn't exist in asmstring? -// FIXME: $dst2 isn't in the asm string as it's implied by $Rd (dst2 = Rd+1) -// how to represent that such that tblgen is happy and we don't -// mark this codegen only? 
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { // Load doubleword def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm, - IIC_iLoad_d_r, "ldrd", "\t$Rd, $addr", + IIC_iLoad_d_r, "ldrd", "\t$Rd, $dst2, $addr", []>, Requires<[IsARM, HasV5TE]>; } @@ -1636,6 +1656,7 @@ multiclass AI2_ldridx<bit isByte, string opc, InstrItinClass itin> { let Inst{23} = addr{12}; let Inst{19-16} = addr{17-14}; let Inst{11-0} = addr{11-0}; + let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2"; } def _POST : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb), (ins GPR:$Rn, am2offset:$offset), @@ -1688,40 +1709,80 @@ let mayLoad = 1, neverHasSideEffects = 1 in { defm LDRH : AI3_ldridx<0b1011, 1, "ldrh", IIC_iLoad_bh_ru>; defm LDRSH : AI3_ldridx<0b1111, 1, "ldrsh", IIC_iLoad_bh_ru>; defm LDRSB : AI3_ldridx<0b1101, 1, "ldrsb", IIC_iLoad_bh_ru>; -let hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in -defm LDRD : AI3_ldridx<0b1101, 0, "ldrd", IIC_iLoad_d_ru>; +let hasExtraDefRegAllocReq = 1 in { +def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb), + (ins addrmode3:$addr), IndexModePre, + LdMiscFrm, IIC_iLoad_d_ru, + "ldrd", "\t$Rt, $Rt2, $addr!", + "$addr.base = $Rn_wb", []> { + bits<14> addr; + let Inst{23} = addr{8}; // U bit + let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm + let Inst{19-16} = addr{12-9}; // Rn + let Inst{11-8} = addr{7-4}; // imm7_4/zero + let Inst{3-0} = addr{3-0}; // imm3_0/Rm +} +def LDRD_POST: AI3ldstidx<0b1101, 0, 1, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb), + (ins GPR:$Rn, am3offset:$offset), IndexModePost, + LdMiscFrm, IIC_iLoad_d_ru, + "ldrd", "\t$Rt, $Rt2, [$Rn], $offset", + "$Rn = $Rn_wb", []> { + bits<10> offset; + bits<4> Rn; + let Inst{23} = offset{8}; // U bit + let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm + let Inst{19-16} = Rn; + let Inst{11-8} = offset{7-4}; // imm7_4/zero + let Inst{3-0} = offset{3-0}; // imm3_0/Rm +} +} // hasExtraDefRegAllocReq = 1 } // mayLoad = 1, neverHasSideEffects = 1 // LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only. 
let mayLoad = 1, neverHasSideEffects = 1 in { -def LDRT : AI2ldstidx<1, 0, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am2offset:$offset), IndexModeNone, - LdFrm, IIC_iLoad_ru, - "ldrt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { +def LDRT : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode2:$addr), IndexModePost, LdFrm, IIC_iLoad_ru, + "ldrt", "\t$Rt, $addr", "$addr.base = $base_wb", []> { + // {17-14} Rn + // {13} 1 == Rm, 0 == imm12 + // {12} isAdd + // {11-0} imm12/Rm + bits<18> addr; + let Inst{25} = addr{13}; + let Inst{23} = addr{12}; let Inst{21} = 1; // overwrite -} -def LDRBT : AI2ldstidx<1, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am2offset:$offset), IndexModeNone, - LdFrm, IIC_iLoad_bh_ru, - "ldrbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { + let Inst{19-16} = addr{17-14}; + let Inst{11-0} = addr{11-0}; + let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2"; +} +def LDRBT : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode2:$addr), IndexModePost, LdFrm, IIC_iLoad_bh_ru, + "ldrbt", "\t$Rt, $addr", "$addr.base = $base_wb", []> { + // {17-14} Rn + // {13} 1 == Rm, 0 == imm12 + // {12} isAdd + // {11-0} imm12/Rm + bits<18> addr; + let Inst{25} = addr{13}; + let Inst{23} = addr{12}; let Inst{21} = 1; // overwrite + let Inst{19-16} = addr{17-14}; + let Inst{11-0} = addr{11-0}; + let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2"; } -def LDRSBT : AI3ldstidx<0b1101, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am3offset:$offset), IndexModePost, - LdMiscFrm, IIC_iLoad_bh_ru, - "ldrsbt", "\t$dst, [$base], $offset", "$base = $base_wb", []> { +def LDRSBT : AI3ldstidxT<0b1101, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, + "ldrsbt", "\t$Rt, $addr", "$addr.base = $base_wb", []> { let Inst{21} = 1; // overwrite } -def LDRHT : AI3ldstidx<0b1011, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am3offset:$offset), IndexModePost, - LdMiscFrm, IIC_iLoad_bh_ru, - "ldrht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { +def LDRHT : AI3ldstidxT<0b1011, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, + "ldrht", "\t$Rt, $addr", "$addr.base = $base_wb", []> { let Inst{21} = 1; // overwrite } -def LDRSHT : AI3ldstidx<0b1111, 1, 1, 0, (outs GPR:$dst, GPR:$base_wb), - (ins GPR:$base, am3offset:$offset), IndexModePost, - LdMiscFrm, IIC_iLoad_bh_ru, - "ldrsht", "\t$dst, [$base], $offset", "$base = $base_wb", []> { +def LDRSHT : AI3ldstidxT<0b1111, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb), + (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, + "ldrsht", "\t$Rt, $addr", "$addr.base = $base_wb", []> { let Inst{21} = 1; // overwrite } } @@ -1734,55 +1795,61 @@ def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm, [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>; // Store doubleword -let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1, - isCodeGenOnly = 1 in // $src2 doesn't exist in asm string -def STRD : AI3str<0b1111, (outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr), +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in +def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr), StMiscFrm, IIC_iStore_d_r, - "strd", "\t$src1, $addr", []>, Requires<[IsARM, HasV5TE]>; + "strd", "\t$Rt, $src2, $addr", []>, Requires<[IsARM, HasV5TE]>; // Indexed stores def STR_PRE : AI2stridx<0, 1, (outs GPR:$Rn_wb), 
(ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePre, StFrm, IIC_iStore_ru, - "str", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + "str", "\t$Rt, [$Rn, $offset]!", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (pre_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STR_POST : AI2stridx<0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePost, StFrm, IIC_iStore_ru, - "str", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + "str", "\t$Rt, [$Rn], $offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STRB_PRE : AI2stridx<1, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePre, StFrm, IIC_iStore_bh_ru, - "strb", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + "strb", "\t$Rt, [$Rn, $offset]!", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STRB_POST: AI2stridx<1, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), IndexModePost, StFrm, IIC_iStore_bh_ru, - "strb", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + "strb", "\t$Rt, [$Rn], $offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_truncsti8 GPR:$Rt, GPR:$Rn, am2offset:$offset))]>; def STRH_PRE : AI3stridx<0b1011, 0, 1, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am3offset:$offset), IndexModePre, StMiscFrm, IIC_iStore_ru, - "strh", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb", + "strh", "\t$Rt, [$Rn, $offset]!", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; def STRH_POST: AI3stridx<0b1011, 0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, GPR:$Rn, am3offset:$offset), IndexModePost, StMiscFrm, IIC_iStore_bh_ru, - "strh", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", + "strh", "\t$Rt, [$Rn], $offset", + "$Rn = $Rn_wb,@earlyclobber $Rn_wb", [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>; // For disassembly only +let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { def STRD_PRE : AI3stdpr<(outs GPR:$base_wb), (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset), StMiscFrm, IIC_iStore_d_ru, @@ -1795,31 +1862,32 @@ def STRD_POST: AI3stdpo<(outs GPR:$base_wb), StMiscFrm, IIC_iStore_d_ru, "strd", "\t$src1, $src2, [$base], $offset", "$base = $base_wb", []>; +} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 // STRT, STRBT, and STRHT are for disassembly only. 
-def STRT : AI2stridx<0, 0, (outs GPR:$Rn_wb), - (ins GPR:$Rt, GPR:$Rn,am2offset:$offset), - IndexModeNone, StFrm, IIC_iStore_ru, - "strt", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", - [/* For disassembly only; pattern left blank */]> { +def STRT : AI2stridxT<0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode2:$addr), + IndexModePost, StFrm, IIC_iStore_ru, + "strt", "\t$Rt, $addr", "$addr.base = $Rn_wb", + [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite + let AsmMatchConverter = "CvtStWriteBackRegAddrMode2"; } -def STRBT : AI2stridx<1, 0, (outs GPR:$Rn_wb), - (ins GPR:$Rt, GPR:$Rn, am2offset:$offset), - IndexModeNone, StFrm, IIC_iStore_bh_ru, - "strbt", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", - [/* For disassembly only; pattern left blank */]> { +def STRBT : AI2stridxT<1, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode2:$addr), + IndexModePost, StFrm, IIC_iStore_bh_ru, + "strbt", "\t$Rt, $addr", "$addr.base = $Rn_wb", + [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite + let AsmMatchConverter = "CvtStWriteBackRegAddrMode2"; } -def STRHT: AI3sthpo<(outs GPR:$base_wb), - (ins GPR:$src, GPR:$base,am3offset:$offset), +def STRHT: AI3sthpo<(outs GPR:$base_wb), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm, IIC_iStore_bh_ru, - "strht", "\t$src, [$base], $offset", "$base = $base_wb", + "strht", "\t$Rt, $addr", "$addr.base = $base_wb", [/* For disassembly only; pattern left blank */]> { let Inst{21} = 1; // overwrite + let AsmMatchConverter = "CvtStWriteBackRegAddrMode3"; } //===----------------------------------------------------------------------===// @@ -1892,7 +1960,7 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f, let Inst{21} = 1; // Writeback let Inst{20} = L_bit; } -} +} let neverHasSideEffects = 1 in { @@ -1912,16 +1980,10 @@ def : MnemonicAlias<"stm", "stmia">; // FIXME: Should pc be an implicit operand like PICADD, etc? let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in -// FIXME: Should be a pseudo-instruction. -def LDMIA_RET : AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, - reglist:$regs, variable_ops), - IndexModeUpd, LdStMulFrm, IIC_iLoad_mBr, - "ldmia${p}\t$Rn!, $regs", - "$Rn = $wb", []> { - let Inst{24-23} = 0b01; // Increment After - let Inst{21} = 1; // Writeback - let Inst{20} = 1; // Load -} +def LDMIA_RET : ARMPseudoInst<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, + reglist:$regs, variable_ops), + Size4Bytes, IIC_iLoad_mBr, []>, + RegConstraint<"$Rn = $wb">; //===----------------------------------------------------------------------===// // Move Instructions. @@ -1933,6 +1995,7 @@ def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr, bits<4> Rd; bits<4> Rm; + let Inst{19-16} = 0b0000; let Inst{11-4} = 0b00000000; let Inst{25} = 0; let Inst{3-0} = Rm; @@ -1959,6 +2022,7 @@ def MOVs : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg:$src), bits<4> Rd; bits<12> src; let Inst{15-12} = Rd; + let Inst{19-16} = 0b0000; let Inst{11-0} = src; let Inst{25} = 0; } @@ -2145,10 +2209,12 @@ defm SBC : AI1_adde_sube_irs<0b0110, "sbc", BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; // ADC and SUBC with 's' bit set. 
-defm ADCS : AI1_adde_sube_s_irs<0b0101, "adcs", - BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; -defm SBCS : AI1_adde_sube_s_irs<0b0110, "sbcs", - BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>; +let usesCustomInserter = 1 in { +defm ADCS : AI1_adde_sube_s_irs< + BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; +defm SBCS : AI1_adde_sube_s_irs< + BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>; +} def RSBri : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iALUi, "rsb", "\t$Rd, $Rn, $imm", @@ -2190,31 +2256,17 @@ def RSBrs : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), } // RSB with 's' bit set. -let isCodeGenOnly = 1, Defs = [CPSR] in { -def RSBSri : AI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, - IIC_iALUi, "rsbs", "\t$Rd, $Rn, $imm", - [(set GPR:$Rd, (subc so_imm:$imm, GPR:$Rn))]> { - bits<4> Rd; - bits<4> Rn; - bits<12> imm; - let Inst{25} = 1; - let Inst{20} = 1; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{11-0} = imm; -} -def RSBSrs : AI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), - DPSoRegFrm, IIC_iALUsr, "rsbs", "\t$Rd, $Rn, $shift", - [(set GPR:$Rd, (subc so_reg:$shift, GPR:$Rn))]> { - bits<4> Rd; - bits<4> Rn; - bits<12> shift; - let Inst{25} = 0; - let Inst{20} = 1; - let Inst{11-0} = shift; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; -} +// NOTE: CPSR def omitted because it will be handled by the custom inserter. +let usesCustomInserter = 1 in { +def RSBSri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + Size4Bytes, IIC_iALUi, + [(set GPR:$Rd, (subc so_imm:$imm, GPR:$Rn))]>; +def RSBSrr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), + Size4Bytes, IIC_iALUr, + [/* For disassembly only; pattern left blank */]>; +def RSBSrs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + Size4Bytes, IIC_iALUsr, + [(set GPR:$Rd, (subc so_reg:$shift, GPR:$Rn))]>; } let Uses = [CPSR] in { @@ -2258,34 +2310,14 @@ def RSCrs : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), } } -// FIXME: Allow these to be predicated. -let isCodeGenOnly = 1, Defs = [CPSR], Uses = [CPSR] in { -def RSCSri : AXI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), - DPFrm, IIC_iALUi, "rscs\t$Rd, $Rn, $imm", - [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<12> imm; - let Inst{25} = 1; - let Inst{20} = 1; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; - let Inst{11-0} = imm; -} -def RSCSrs : AXI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), - DPSoRegFrm, IIC_iALUsr, "rscs\t$Rd, $Rn, $shift", - [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>, - Requires<[IsARM]> { - bits<4> Rd; - bits<4> Rn; - bits<12> shift; - let Inst{25} = 0; - let Inst{20} = 1; - let Inst{11-0} = shift; - let Inst{15-12} = Rd; - let Inst{19-16} = Rn; -} +// NOTE: CPSR def omitted because it will be handled by the custom inserter. +let usesCustomInserter = 1, Uses = [CPSR] in { +def RSCSri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), + Size4Bytes, IIC_iALUi, + [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>; +def RSCSrs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), + Size4Bytes, IIC_iALUsr, + [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>; } // (sub X, imm) gets canonicalized to (add X, -imm). Match this form. @@ -2300,8 +2332,10 @@ def : ARMPat<(addc GPR:$src, so_imm_neg:$imm), // The with-carry-in form matches bitwise not instead of the negation. 
// Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. -def : ARMPat<(adde GPR:$src, so_imm_not:$imm), +def : ARMPat<(adde_dead_carry GPR:$src, so_imm_not:$imm), (SBCri GPR:$src, so_imm_not:$imm)>; +def : ARMPat<(adde_live_carry GPR:$src, so_imm_not:$imm), + (SBCSri GPR:$src, so_imm_not:$imm)>; // Note: These are implemented in C++ code, because they have to generate // ADD/SUBrs instructions, which use a complex pattern that a xform function @@ -2617,14 +2651,16 @@ def MULv5: ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, def MUL : AsMul1I32<0b0000000, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))]>, - Requires<[IsARM, HasV6]>; + Requires<[IsARM, HasV6]> { + let Inst{15-12} = 0b0000; +} } let Constraints = "@earlyclobber $Rd" in def MLAv5: ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s), - Size4Bytes, IIC_iMAC32, - [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, + Size4Bytes, IIC_iMAC32, + [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>, Requires<[IsARM, NoV6]> { bits<4> Ra; let Inst{15-12} = Ra; @@ -2657,7 +2693,7 @@ let neverHasSideEffects = 1 in { let isCommutable = 1 in { let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { def SMULLv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), Size4Bytes, IIC_iMUL64, []>, Requires<[IsARM, NoV6]>; @@ -2681,15 +2717,15 @@ def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi), // Multiply + accumulate let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in { def SMLALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), Size4Bytes, IIC_iMAC64, []>, Requires<[IsARM, NoV6]>; def UMLALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), Size4Bytes, IIC_iMAC64, []>, Requires<[IsARM, NoV6]>; def UMAALv5 : ARMPseudoInst<(outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), Size4Bytes, IIC_iMAC64, []>, Requires<[IsARM, NoV6]>; @@ -2970,17 +3006,25 @@ def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "revsh", "\t$Rd, $Rm", [(set GPR:$Rd, (sext_inreg - (or (srl (and GPR:$Rm, 0xFF00), (i32 8)), + (or (srl GPR:$Rm, (i32 8)), (shl GPR:$Rm, (i32 8))), i16))]>, Requires<[IsARM, HasV6]>; +def : ARMV6Pat<(sext_inreg (or (srl (and GPR:$Rm, 0xFF00), (i32 8)), + (shl GPR:$Rm, (i32 8))), i16), + (REVSH GPR:$Rm)>; + +// Need the AddedComplexity or else MOVs + REV would be chosen. 
+let AddedComplexity = 5 in +def : ARMV6Pat<(sra (bswap GPR:$Rm), (i32 16)), (REVSH GPR:$Rm)>; + def lsl_shift_imm : SDNodeXForm<imm, [{ unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::lsl, N->getZExtValue()); return CurDAG->getTargetConstant(Sh, MVT::i32); }]>; -def lsl_amt : PatLeaf<(i32 imm), [{ - return (N->getZExtValue() < 32); +def lsl_amt : ImmLeaf<i32, [{ + return Imm > 0 && Imm < 32; }], lsl_shift_imm>; def PKHBT : APKHI<0b01101000, 0, (outs GPR:$Rd), @@ -3002,8 +3046,8 @@ def asr_shift_imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(Sh, MVT::i32); }]>; -def asr_amt : PatLeaf<(i32 imm), [{ - return (N->getZExtValue() <= 32); +def asr_amt : ImmLeaf<i32, [{ + return Imm > 0 && Imm <= 32; }], asr_shift_imm>; // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and @@ -3119,88 +3163,43 @@ def BCCZi64 : PseudoInst<(outs), // Conditional moves // FIXME: should be able to write a pattern for ARMcmov, but can't use // a two-value operand where a dag node expects two operands. :( -// FIXME: These should all be pseudo-instructions that get expanded to -// the normal MOV instructions. That would fix the dependency on -// special casing them in tblgen. let neverHasSideEffects = 1 in { -def MOVCCr : AI1<0b1101, (outs GPR:$Rd), (ins GPR:$false, GPR:$Rm), DPFrm, - IIC_iCMOVr, "mov", "\t$Rd, $Rm", - [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">, UnaryDP { - bits<4> Rd; - bits<4> Rm; - let Inst{25} = 0; - let Inst{20} = 0; - let Inst{15-12} = Rd; - let Inst{11-4} = 0b00000000; - let Inst{3-0} = Rm; -} - -def MOVCCs : AI1<0b1101, (outs GPR:$Rd), - (ins GPR:$false, so_reg:$shift), DPSoRegFrm, IIC_iCMOVsr, - "mov", "\t$Rd, $shift", - [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg:$shift, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">, UnaryDP { - bits<4> Rd; - bits<12> shift; - let Inst{25} = 0; - let Inst{20} = 0; - let Inst{19-16} = 0; - let Inst{15-12} = Rd; - let Inst{11-0} = shift; -} +def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p), + Size4Bytes, IIC_iCMOVr, + [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; +def MOVCCs : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_reg:$shift, pred:$p), + Size4Bytes, IIC_iCMOVsr, + [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg:$shift, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $Rd">; let isMoveImm = 1 in -def MOVCCi16 : AI1<0b1000, (outs GPR:$Rd), (ins GPR:$false, i32imm_hilo16:$imm), - DPFrm, IIC_iMOVi, - "movw", "\t$Rd, $imm", - []>, - RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>, - UnaryDP { - bits<4> Rd; - bits<16> imm; - let Inst{25} = 1; - let Inst{20} = 0; - let Inst{19-16} = imm{15-12}; - let Inst{15-12} = Rd; - let Inst{11-0} = imm{11-0}; -} +def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, i32imm_hilo16:$imm, pred:$p), + Size4Bytes, IIC_iMOVi, + []>, + RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>; let isMoveImm = 1 in -def MOVCCi : AI1<0b1101, (outs GPR:$Rd), - (ins GPR:$false, so_imm:$imm), DPFrm, IIC_iCMOVi, - "mov", "\t$Rd, $imm", +def MOVCCi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_imm:$imm, pred:$p), + Size4Bytes, IIC_iCMOVi, [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">, UnaryDP { - bits<4> Rd; - bits<12> imm; - let Inst{25} = 1; - let Inst{20} = 0; - let Inst{19-16} = 0b0000; - let Inst{15-12} = Rd; - let Inst{11-0} = imm; -} + 
RegConstraint<"$false = $Rd">; // Two instruction predicate mov immediate. let isMoveImm = 1 in -def MOVCCi32imm : PseudoInst<(outs GPR:$Rd), - (ins GPR:$false, i32imm:$src, pred:$p), - IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">; +def MOVCCi32imm : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, i32imm:$src, pred:$p), + Size8Bytes, IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">; let isMoveImm = 1 in -def MVNCCi : AI1<0b1111, (outs GPR:$Rd), - (ins GPR:$false, so_imm:$imm), DPFrm, IIC_iCMOVi, - "mvn", "\t$Rd, $imm", +def MVNCCi : ARMPseudoInst<(outs GPR:$Rd), + (ins GPR:$false, so_imm:$imm, pred:$p), + Size4Bytes, IIC_iCMOVi, [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>, - RegConstraint<"$false = $Rd">, UnaryDP { - bits<4> Rd; - bits<12> imm; - let Inst{25} = 1; - let Inst{20} = 0; - let Inst{19-16} = 0b0000; - let Inst{15-12} = Rd; - let Inst{11-0} = imm; -} + RegConstraint<"$false = $Rd">; } // neverHasSideEffects //===----------------------------------------------------------------------===// @@ -3221,13 +3220,6 @@ def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, let Inst{31-4} = 0xf57ff05; let Inst{3-0} = opt; } - -def DMB_MCR : AInoP<(outs), (ins GPR:$zero), MiscFrm, NoItinerary, - "mcr", "\tp15, 0, $zero, c7, c10, 5", - [(ARMMemBarrierMCR GPR:$zero)]>, - Requires<[IsARM, HasV6]> { - // FIXME: add encoding -} } def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, @@ -3266,6 +3258,18 @@ let usesCustomInserter = 1 in { def ATOMIC_LOAD_NAND_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I8 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>; def ATOMIC_LOAD_ADD_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>; @@ -3284,6 +3288,18 @@ let usesCustomInserter = 1 in { def ATOMIC_LOAD_NAND_I16 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I16 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>; def ATOMIC_LOAD_ADD_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>; @@ -3302,6 +3318,18 @@ let usesCustomInserter = 1 in { def 
@@ -3302,6 +3318,18 @@ let usesCustomInserter = 1 in { def ATOMIC_LOAD_NAND_I32 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary, [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>; + def ATOMIC_LOAD_MIN_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_MAX_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMIN_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_umin_32 GPR:$ptr, GPR:$val))]>; + def ATOMIC_LOAD_UMAX_I32 : PseudoInst< + (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary, + [(set GPR:$dst, (atomic_load_umax_32 GPR:$ptr, GPR:$val))]>; def ATOMIC_SWAP_I8 : PseudoInst< (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary, @@ -3326,39 +3354,26 @@ let usesCustomInserter = 1 in { } let mayLoad = 1 in { -def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary, - "ldrexb", "\t$Rt, [$Rn]", - []>; -def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary, - "ldrexh", "\t$Rt, [$Rn]", - []>; -def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins GPR:$Rn), NoItinerary, - "ldrex", "\t$Rt, [$Rn]", - []>; -def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins GPR:$Rn), - NoItinerary, - "ldrexd", "\t$Rt, $Rt2, [$Rn]", - []>; +def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary, + "ldrexb", "\t$Rt, $addr", []>; +def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary, + "ldrexh", "\t$Rt, $addr", []>; +def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary, + "ldrex", "\t$Rt, $addr", []>; +def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr), + NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>; } let mayStore = 1, Constraints = "@earlyclobber $Rd" in { -def STREXB : AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$src, GPR:$Rn), - NoItinerary, - "strexb", "\t$Rd, $src, [$Rn]", - []>; -def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, GPR:$Rn), - NoItinerary, - "strexh", "\t$Rd, $Rt, [$Rn]", - []>; -def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, GPR:$Rn), - NoItinerary, - "strex", "\t$Rd, $Rt, [$Rn]", - []>; +def STREXB : AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr), + NoItinerary, "strexb", "\t$Rd, $Rt, $addr", []>; +def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr), + NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>; +def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr), + NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>; def STREXD : AIstrex<0b01, (outs GPR:$Rd), - (ins GPR:$Rt, GPR:$Rt2, GPR:$Rn), - NoItinerary, - "strexd", "\t$Rd, $Rt, $Rt2, [$Rn]", - []>; + (ins GPR:$Rt, GPR:$Rt2, addrmode7:$addr), + NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []>; } // Clear-Exclusive is for disassembly only. @@ -3377,238 +3392,7 @@ def SWPB : AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swpb", } //===----------------------------------------------------------------------===// -// TLS Instructions -// - -// __aeabi_read_tp preserves the registers r1-r3. -// This is a pseudo inst so that we can get the encoding right, -// complete with fixup for the aeabi_read_tp function.
-let isCall = 1, - Defs = [R0, R12, LR, CPSR], Uses = [SP] in { - def TPsoft : PseudoInst<(outs), (ins), IIC_Br, - [(set R0, ARMthread_pointer)]>; -} - -//===----------------------------------------------------------------------===// -// SJLJ Exception handling intrinsics -// eh_sjlj_setjmp() is an instruction sequence to store the return -// address and save #0 in R0 for the non-longjmp case. -// Since by its nature we may be coming from some other function to get -// here, and we're using the stack frame for the containing function to -// save/restore registers, we can't keep anything live in regs across -// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon -// when we get here from a longjmp(). We force everthing out of registers -// except for our own input by listing the relevant registers in Defs. By -// doing so, we also cause the prologue/epilogue code to actively preserve -// all of the callee-saved resgisters, which is exactly what we want. -// A constant value is passed in $val, and we use the location as a scratch. -// -// These are pseudo-instructions and are lowered to individual MC-insts, so -// no encoding information is necessary. -let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, - D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, - D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ], hasSideEffects = 1, isBarrier = 1 in { - def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), - NoItinerary, - [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, - Requires<[IsARM, HasVFP2]>; -} - -let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], - hasSideEffects = 1, isBarrier = 1 in { - def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), - NoItinerary, - [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, - Requires<[IsARM, NoVFP]>; -} - -// FIXME: Non-Darwin version(s) -let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, - Defs = [ R7, LR, SP ] in { -def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), - NoItinerary, - [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, - Requires<[IsARM, IsDarwin]>; -} - -// eh.sjlj.dispatchsetup pseudo-instruction. -// This pseudo is used for ARM, Thumb1 and Thumb2. Any differences are -// handled when the pseudo is expanded (which happens before any passes -// that need the instruction size). -let isBarrier = 1, hasSideEffects = 1 in -def Int_eh_sjlj_dispatchsetup : - PseudoInst<(outs), (ins GPR:$src), NoItinerary, - [(ARMeh_sjlj_dispatchsetup GPR:$src)]>, - Requires<[IsDarwin]>; - -//===----------------------------------------------------------------------===// -// Non-Instruction Patterns -// - -// Large immediate handling. - -// 32-bit immediate using two piece so_imms or movw + movt. -// This is a single pseudo instruction, the benefit is that it can be remat'd -// as a single unit instead of having to handle reg inputs. -// FIXME: Remove this when we can do generalized remat. -let isReMaterializable = 1, isMoveImm = 1 in -def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2, - [(set GPR:$dst, (arm_i32imm:$src))]>, - Requires<[IsARM]>; - -// Pseudo instruction that combines movw + movt + add pc (if PIC). -// It also makes it possible to rematerialize the instructions. -// FIXME: Remove this when we can do generalized remat and when machine licm -// can properly the instructions. 
-let isReMaterializable = 1 in { -def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), - IIC_iMOVix2addpc, - [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, - Requires<[IsARM, UseMovt]>; - -def MOV_ga_dyn : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), - IIC_iMOVix2, - [(set GPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>, - Requires<[IsARM, UseMovt]>; - -let AddedComplexity = 10 in -def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), - IIC_iMOVix2ld, - [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>, - Requires<[IsARM, UseMovt]>; -} // isReMaterializable - -// ConstantPool, GlobalAddress, and JumpTable -def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>, - Requires<[IsARM, DontUseMovt]>; -def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; -def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, - Requires<[IsARM, UseMovt]>; -def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), - (LEApcrelJT tjumptable:$dst, imm:$id)>; - -// TODO: add,sub,and, 3-instr forms? - -// Tail calls -def : ARMPat<(ARMtcret tcGPR:$dst), - (TCRETURNri tcGPR:$dst)>, Requires<[IsDarwin]>; - -def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), - (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; - -def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), - (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; - -def : ARMPat<(ARMtcret tcGPR:$dst), - (TCRETURNriND tcGPR:$dst)>, Requires<[IsNotDarwin]>; - -def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), - (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; - -def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), - (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; - -// Direct calls -def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, - Requires<[IsARM, IsNotDarwin]>; -def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>, - Requires<[IsARM, IsDarwin]>; - -// zextload i1 -> zextload i8 -def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; -def : ARMPat<(zextloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; - -// extload -> zextload -def : ARMPat<(extloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; -def : ARMPat<(extloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; -def : ARMPat<(extloadi8 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; -def : ARMPat<(extloadi8 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; - -def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; - -def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>; -def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>; - -// smul* and smla* -def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), - (sra (shl GPR:$b, (i32 16)), (i32 16))), - (SMULBB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), - (SMULBB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), - (sra GPR:$b, (i32 16))), - (SMULBT GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))), - (SMULBT GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), - (sra (shl GPR:$b, (i32 16)), (i32 16))), - (SMULTB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b), - (SMULTB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), - (i32 16)), - (SMULWB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)), 
- (SMULWB GPR:$a, GPR:$b)>; - -def : ARMV5TEPat<(add GPR:$acc, - (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), - (sra (shl GPR:$b, (i32 16)), (i32 16)))), - (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul sext_16_node:$a, sext_16_node:$b)), - (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), - (sra GPR:$b, (i32 16)))), - (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))), - (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul (sra GPR:$a, (i32 16)), - (sra (shl GPR:$b, (i32 16)), (i32 16)))), - (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)), - (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), - (i32 16))), - (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; -def : ARMV5TEPat<(add GPR:$acc, - (sra (mul GPR:$a, sext_16_node:$b), (i32 16))), - (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; - -//===----------------------------------------------------------------------===// -// Thumb Support -// - -include "ARMInstrThumb.td" - -//===----------------------------------------------------------------------===// -// Thumb2 Support -// - -include "ARMInstrThumb2.td" - -//===----------------------------------------------------------------------===// -// Floating Point Support -// - -include "ARMInstrVFP.td" - -//===----------------------------------------------------------------------===// -// Advanced SIMD (NEON) Support -// - -include "ARMInstrNEON.td" - -//===----------------------------------------------------------------------===// -// Coprocessor Instructions. For disassembly only. +// Coprocessor Instructions. 
// def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, @@ -3652,17 +3436,18 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, let Inst{23-20} = opc1; } -class ACI<dag oops, dag iops, string opc, string asm> - : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, NoItinerary, - opc, asm, "", [/* For disassembly only; pattern left blank */]> { +class ACI<dag oops, dag iops, string opc, string asm, + IndexMode im = IndexModeNone> + : InoP<oops, iops, AddrModeNone, Size4Bytes, im, BrFrm, NoItinerary, + opc, asm, "", [/* For disassembly only; pattern left blank */]> { let Inst{27-25} = 0b110; } -multiclass LdStCop<bits<4> op31_28, bit load, string opc> { +multiclass LdStCop<bits<4> op31_28, bit load, dag ops, string opc, string cond>{ def _OFFSET : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), - opc, "\tp$cop, cr$CRd, $addr"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr"> { let Inst{31-28} = op31_28; let Inst{24} = 1; // P = 1 let Inst{21} = 0; // W = 0 @@ -3671,8 +3456,8 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def _PRE : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), - opc, "\tp$cop, cr$CRd, $addr!"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr!", IndexModePre> { let Inst{31-28} = op31_28; let Inst{24} = 1; // P = 1 let Inst{21} = 1; // W = 1 @@ -3681,8 +3466,8 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def _POST : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, am2offset:$offset), - opc, "\tp$cop, cr$CRd, [$base], $offset"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr", IndexModePost> { let Inst{31-28} = op31_28; let Inst{24} = 0; // P = 0 let Inst{21} = 1; // W = 1 @@ -3691,8 +3476,9 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def _OPTION : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, i32imm:$option), - opc, "\tp$cop, cr$CRd, [$base], $option"> { + !con((ins nohash_imm:$cop,nohash_imm:$CRd,GPR:$base, nohash_imm:$option), + ops), + !strconcat(opc, cond), "\tp$cop, cr$CRd, [$base], \\{$option\\}"> { let Inst{31-28} = op31_28; let Inst{24} = 0; // P = 0 let Inst{23} = 1; // U = 1 @@ -3702,8 +3488,8 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def L_OFFSET : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), - !strconcat(opc, "l"), "\tp$cop, cr$CRd, $addr"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr"> { let Inst{31-28} = op31_28; let Inst{24} = 1; // P = 1 let Inst{21} = 0; // W = 0 @@ -3712,8 +3498,9 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def L_PRE : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), - !strconcat(opc, "l"), "\tp$cop, cr$CRd, $addr!"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr!", + IndexModePre> { let Inst{31-28} = op31_28; let Inst{24} = 1; // P = 1 let Inst{21} = 1; // W = 1 @@ -3722,8 +3509,9 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def L_POST : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, am2offset:$offset), - !strconcat(opc, "l"), "\tp$cop, cr$CRd, 
[$base], $offset"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops), + !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr", + IndexModePost> { let Inst{31-28} = op31_28; let Inst{24} = 0; // P = 0 let Inst{21} = 1; // W = 1 @@ -3732,8 +3520,10 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } def L_OPTION : ACI<(outs), - (ins nohash_imm:$cop, nohash_imm:$CRd, GPR:$base, nohash_imm:$option), - !strconcat(opc, "l"), "\tp$cop, cr$CRd, [$base], $option"> { + !con((ins nohash_imm:$cop, nohash_imm:$CRd,GPR:$base,nohash_imm:$option), + ops), + !strconcat(!strconcat(opc, "l"), cond), + "\tp$cop, cr$CRd, [$base], \\{$option\\}"> { let Inst{31-28} = op31_28; let Inst{24} = 0; // P = 0 let Inst{23} = 1; // U = 1 @@ -3743,19 +3533,18 @@ multiclass LdStCop<bits<4> op31_28, bit load, string opc> { } } -defm LDC : LdStCop<{?,?,?,?}, 1, "ldc">; -defm LDC2 : LdStCop<0b1111, 1, "ldc2">; -defm STC : LdStCop<{?,?,?,?}, 0, "stc">; -defm STC2 : LdStCop<0b1111, 0, "stc2">; +defm LDC : LdStCop<{?,?,?,?}, 1, (ins pred:$p), "ldc", "${p}">; +defm LDC2 : LdStCop<0b1111, 1, (ins), "ldc2", "">; +defm STC : LdStCop<{?,?,?,?}, 0, (ins pred:$p), "stc", "${p}">; +defm STC2 : LdStCop<0b1111, 0, (ins), "stc2", "">; //===----------------------------------------------------------------------===// // Move between coprocessor and ARM core register -- for disassembly only // -class MovRCopro<string opc, bit direction> - : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), - NoItinerary, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", +class MovRCopro<string opc, bit direction, dag oops, dag iops> + : ABI<0b1110, oops, iops, NoItinerary, opc, + "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", [/* For disassembly only; pattern left blank */]> { let Inst{20} = direction; let Inst{4} = 1; @@ -3775,13 +3564,17 @@ class MovRCopro<string opc, bit direction> let Inst{19-16} = CRn; } -def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */>; -def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */>; +def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, c_imm:$CRn, c_imm:$CRm, + i32imm:$opc2)>; +def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, + (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, + c_imm:$CRn, c_imm:$CRm, i32imm:$opc2)>; -class MovRCopro2<string opc, bit direction> - : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), - NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), +class MovRCopro2<string opc, bit direction, dag oops, dag iops> + : ABXI<0b1110, oops, iops, NoItinerary, + !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), [/* For disassembly only; pattern left blank */]> { let Inst{31-28} = 0b1111; let Inst{20} = direction; @@ -3802,8 +3595,14 @@ class MovRCopro2<string opc, bit direction> let Inst{19-16} = CRn; } -def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */>; -def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */>; +def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, i32imm:$opc1, + GPR:$Rt, c_imm:$CRn, c_imm:$CRm, + i32imm:$opc2)>; +def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */, + (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, + c_imm:$CRn, c_imm:$CRm, + i32imm:$opc2)>; class 
MovRRCopro<string opc, bit direction> : ABI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1, @@ -3909,3 +3708,241 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary, let Inst{15-12} = 0b1111; let Inst{11-0} = a; } + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. +// This is a pseudo inst so that we can get the encoding right, +// complete with fixup for the aeabi_read_tp function. +let isCall = 1, + Defs = [R0, R12, LR, CPSR], Uses = [SP] in { + def TPsoft : PseudoInst<(outs), (ins), IIC_Br, + [(set R0, ARMthread_pointer)]>; +} + +//===----------------------------------------------------------------------===// +// SJLJ Exception handling intrinsics +// eh_sjlj_setjmp() is an instruction sequence to store the return +// address and save #0 in R0 for the non-longjmp case. +// Since by its nature we may be coming from some other function to get +// here, and we're using the stack frame for the containing function to +// save/restore registers, we can't keep anything live in regs across +// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon +// when we get here from a longjmp(). We force everything out of registers +// except for our own input by listing the relevant registers in Defs. By +// doing so, we also cause the prologue/epilogue code to actively preserve +// all of the callee-saved registers, which is exactly what we want. +// A constant value is passed in $val, and we use the location as a scratch. +// +// These are pseudo-instructions and are lowered to individual MC-insts, so +// no encoding information is necessary. +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, + D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, + D31 ], hasSideEffects = 1, isBarrier = 1 in { + def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), + NoItinerary, + [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, + Requires<[IsARM, HasVFP2]>; +} + +let Defs = + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + hasSideEffects = 1, isBarrier = 1 in { + def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), + NoItinerary, + [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, + Requires<[IsARM, NoVFP]>; +} + +// FIXME: Non-Darwin version(s) +let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, + Defs = [ R7, LR, SP ] in { +def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), + NoItinerary, + [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, + Requires<[IsARM, IsDarwin]>; +} + +// eh.sjlj.dispatchsetup pseudo-instruction. +// This pseudo is used for ARM, Thumb1 and Thumb2. Any differences are +// handled when the pseudo is expanded (which happens before any passes +// that need the instruction size). +let isBarrier = 1, hasSideEffects = 1 in +def Int_eh_sjlj_dispatchsetup : + PseudoInst<(outs), (ins), NoItinerary, + [(ARMeh_sjlj_dispatchsetup)]>, + Requires<[IsDarwin]>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// Large immediate handling. + +// 32-bit immediate using two piece so_imms or movw + movt. +// This is a single pseudo instruction, the benefit is that it can be remat'd +// as a single unit instead of having to handle reg inputs. +// FIXME: Remove this when we can do generalized remat. +let isReMaterializable = 1, isMoveImm = 1 in +def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2, + [(set GPR:$dst, (arm_i32imm:$src))]>, + Requires<[IsARM]>;
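MOVi32imm stays rematerializable as a single unit because its expansion is purely mechanical. A sketch of the movw/movt split it lowers to when MOVT is available (v6t2 and later; pre-v6t2 targets use two shifter-operand immediate pieces instead, and splitMovwMovt is an illustrative name, not an LLVM API):

#include <cstdint>
#include <utility>

// Split a 32-bit immediate into the halves of a movw/movt pair:
//   movw Rd, #lo16   ; writes the low half, zeroing the rest of Rd
//   movt Rd, #hi16   ; writes the high half, preserving the low half
std::pair<uint16_t, uint16_t> splitMovwMovt(uint32_t Imm) {
  uint16_t Lo = static_cast<uint16_t>(Imm & 0xffff);
  uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
  return {Lo, Hi};
}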
+ +// Pseudo instruction that combines movw + movt + add pc (if PIC). +// It also makes it possible to rematerialize the instructions. +// FIXME: Remove this when we can do generalized remat and when machine licm +// can properly hoist the instructions. +let isReMaterializable = 1 in { +def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2addpc, + [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>, + Requires<[IsARM, UseMovt]>; + +def MOV_ga_dyn : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2, + [(set GPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>, + Requires<[IsARM, UseMovt]>; + +let AddedComplexity = 10 in +def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), + IIC_iMOVix2ld, + [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>, + Requires<[IsARM, UseMovt]>; +} // isReMaterializable + +// ConstantPool, GlobalAddress, and JumpTable +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>, + Requires<[IsARM, DontUseMovt]>; +def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, + Requires<[IsARM, UseMovt]>; +def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (LEApcrelJT tjumptable:$dst, imm:$id)>; + +// TODO: add,sub,and, 3-instr forms? + +// Tail calls +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNri tcGPR:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNriND tcGPR:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; + +// Direct calls +def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, + Requires<[IsARM, IsNotDarwin]>; +def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>, + Requires<[IsARM, IsDarwin]>; + +// zextload i1 -> zextload i8 +def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(zextloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; + +// extload -> zextload +def : ARMPat<(extloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; +def : ARMPat<(extloadi8 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; +def : ARMPat<(extloadi8 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>; + +def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; + +def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>; +def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>; + +// smul* and smla* +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, 
GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16))), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), + (i32 16)), + (SMULWB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)), + (SMULWB GPR:$a, GPR:$b)>; + +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul sext_16_node:$a, sext_16_node:$b)), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, (i32 16)), (i32 16)), + (sra GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), + (sra (shl GPR:$b, (i32 16)), (i32 16)))), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))), + (i32 16))), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, sext_16_node:$b), (i32 16))), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; + + +// Pre-v7 uses MCR for synchronization barriers. +def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>, + Requires<[IsARM, HasV6]>; + + +//===----------------------------------------------------------------------===// +// Thumb Support +// + +include "ARMInstrThumb.td" + +//===----------------------------------------------------------------------===// +// Thumb2 Support +// + +include "ARMInstrThumb2.td" + +//===----------------------------------------------------------------------===// +// Floating Point Support +// + +include "ARMInstrVFP.td" + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) Support +// + +include "ARMInstrNEON.td" + diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index dc3d63e..e34d69a 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -80,6 +80,12 @@ def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; +def NEONvbsl : SDNode<"ARMISD::VBSL", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>>; + def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; // VDUPLANE can produce a quad-register result from a double-register source, @@ -146,10 +152,6 @@ def VLDMQIA : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn), IIC_fpLoad_m, "", [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>; -def VLDMQDB - : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn), - IIC_fpLoad_m, "", - [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>; // Use VSTM to store a Q register as a D register pair. // This is a pseudo instruction that is expanded to VSTMD after reg alloc. 
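The NEONvbsl node added in the hunk above wraps the NEON bitwise-select instruction. VBSL computes Vd = (Vn & Vd) | (Vm & ~Vd), with the original destination supplying the mask; a scalar C++ model of one 32-bit lane:

#include <cstdint>

// Bitwise select: take bits of a where mask is 1, bits of b where it is 0.
// VBSL applies this per bit across the whole D or Q register.
uint32_t vbsl(uint32_t mask, uint32_t a, uint32_t b) {
  return (a & mask) | (b & ~mask);
}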
@@ -157,10 +159,6 @@ def VSTMQIA : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn), IIC_fpStore_m, "", [(store (v2f64 QPR:$src), GPR:$Rn)]>; -def VSTMQDB - : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn), - IIC_fpStore_m, "", - [(store (v2f64 QPR:$src), GPR:$Rn)]>; // Classes for VLD* pseudo-instructions with multi-register operands. // These are expanded to real instructions after register allocation. @@ -1801,7 +1799,7 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VDSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (Ty DPR:$Vd), @@ -1811,7 +1809,7 @@ class N3VDSL<bits<2> op21_20, bits<4> op11_8, } class N3VDSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","", [(set (Ty DPR:$Vd), @@ -1841,7 +1839,7 @@ class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VQSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -1852,7 +1850,7 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8, } class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm[$lane]","", [(set (ResTy QPR:$Vd), @@ -1874,7 +1872,7 @@ class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, } class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (Ty DPR:$Vd), @@ -1885,7 +1883,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, } class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (Ty DPR:$Vd), @@ -1915,7 +1913,7 @@ class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -1927,7 +1925,7 @@ class 
N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -1959,7 +1957,7 @@ class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane32<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -1972,7 +1970,7 @@ class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode ShOp> - : N3V<0, 1, op21_20, op11_8, 1, 0, + : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -1994,7 +1992,7 @@ class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator MulOp, SDPatternOperator ShOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane32<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -2008,7 +2006,7 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode MulOp, SDNode ShOp> - : N3V<1, 1, op21_20, op11_8, 1, 0, + : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -2069,7 +2067,7 @@ class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", @@ -2081,7 +2079,7 @@ class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8, class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "$src1 = $Vd", @@ -2116,7 +2114,7 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ 
-2129,7 +2127,7 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, @@ -2164,7 +2162,7 @@ class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set QPR:$Vd, @@ -2173,7 +2171,7 @@ class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8, class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, SDNode OpNode> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set QPR:$Vd, @@ -2219,7 +2217,7 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -2229,7 +2227,7 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> - : N3V<op24, 1, op21_20, op11_8, 1, 0, + : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, nohash_imm:$lane), NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm[$lane]", "", [(set (ResTy QPR:$Vd), @@ -2288,17 +2286,17 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Shift by immediate, // both double- and quad-register. 
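A note on the ImmTy operand being threaded through the shift classes below: NEON immediate shifts left and right accept different ranges, so a bare i32imm no longer fits every instruction. For an N-bit element a left shift takes 0..N-1 while a right shift takes 1..N, which is what the shr_imm8/shr_imm16/shr_imm32/shr_imm64 operands used later in this patch enforce. The constraint written out as a C++ sketch under those assumptions:

// Sketch only -- the real constraint lives in the shr_imm* operand
// definitions, not in C++ code.
static bool isValidNEONShiftImm(unsigned EltBits, unsigned Imm,
                                bool IsRightShift) {
  if (IsRightShift)
    return Imm >= 1 && Imm <= EltBits;  // vshr/vrshr/vsra/vsri: 1..N
  return Imm < EltBits;                 // vshl/vsli: 0..N-1
}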
class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode OpNode> + Format f, InstrItinClass itin, Operand ImmTy, + string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$Vd), (ins DPR:$Vm, i32imm:$SIMM), f, itin, + (outs DPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>; class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - Format f, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode OpNode> + Format f, InstrItinClass itin, Operand ImmTy, + string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$Vd), (ins QPR:$Vm, i32imm:$SIMM), f, itin, + (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>; @@ -2315,9 +2313,9 @@ class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // Narrow shift by immediate. class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, SDNode OpNode> + ValueType ResTy, ValueType OpTy, Operand ImmTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs DPR:$Vd), (ins QPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, itin, + (outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "", [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>; @@ -2325,16 +2323,18 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // Shift right by immediate and accumulate, // both double- and quad-register. class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Operand ImmTy, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), - (ins DPR:$src1, DPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, + (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set DPR:$Vd, (Ty (add DPR:$src1, (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>; class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Operand ImmTy, string OpcodeStr, string Dt, + ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), - (ins QPR:$src1, QPR:$Vm, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, + (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set QPR:$Vd, (Ty (add QPR:$src1, (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>; @@ -2342,15 +2342,17 @@ class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // Shift by immediate and insert, // both double- and quad-register. 
class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> + Operand ImmTy, Format f, string OpcodeStr, string Dt, + ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd), - (ins DPR:$src1, DPR:$Vm, i32imm:$SIMM), f, IIC_VSHLiD, + (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>; class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> + Operand ImmTy, Format f, string OpcodeStr, string Dt, + ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd), - (ins QPR:$src1, QPR:$Vm, i32imm:$SIMM), f, IIC_VSHLiQ, + (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd", [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>; @@ -3010,40 +3012,77 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // Neon 2-register vector shift by immediate, // with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: -multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - SDNode OpNode, Format f> { +multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { + // 64-bit vector types. + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm, + OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; + // imm6 = xxxxxx +} +multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode> { // 64-bit vector types. 
- def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8, OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16, OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32, OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, f, itin, + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; // imm6 = xxxxxx // 128-bit vector types. - def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8, OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16, OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32, OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, f, itin, + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; // imm6 = xxxxxx } @@ -3053,79 +3092,113 @@ multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, SDNode ShOp> { // 64-bit vector types. - def v8i8 : N2VDShAdd<op24, op23, op11_8, 0, op4, + def v8i8 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm8, OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, + def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm16, OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, + def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm32, OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, + def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>; // imm6 = xxxxxx // 128-bit vector types. 
- def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, + def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm8, OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, + def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm16, OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, + def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm32, OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, + def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>; // imm6 = xxxxxx } - // Neon Shift-Insert vector operations, // with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: -multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - string OpcodeStr, SDNode ShOp, - Format f> { +multiclass N2VShInsL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr> { + // 64-bit vector types. + def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>; + // imm6 = xxxxxx + + // 128-bit vector types. + def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> { + let Inst{21-19} = 0b001; // imm6 = 001xxx + } + def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> { + let Inst{21-20} = 0b01; // imm6 = 01xxxx + } + def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> { + let Inst{21} = 0b1; // imm6 = 1xxxxx + } + def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, i32imm, + N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>; + // imm6 = xxxxxx +} +multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, + string OpcodeStr> { // 64-bit vector types. - def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "8", v8i8, ShOp> { + def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm8, + N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "16", v4i16, ShOp> { + def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm16, + N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "32", v2i32, ShOp> { + def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm32, + N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, - f, OpcodeStr, "64", v1i64, ShOp>; + def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, shr_imm64, + N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>; // imm6 = xxxxxx // 128-bit vector types. 
- def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "8", v16i8, ShOp> { + def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm8, + N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "16", v8i16, ShOp> { + def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm16, + N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, - f, OpcodeStr, "32", v4i32, ShOp> { + def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm32, + N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, - f, OpcodeStr, "64", v2i64, ShOp>; + def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, shr_imm64, + N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>; // imm6 = xxxxxx } @@ -3153,15 +3226,18 @@ multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, SDNode OpNode> { def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, - OpcodeStr, !strconcat(Dt, "16"), v8i8, v8i16, OpNode> { + OpcodeStr, !strconcat(Dt, "16"), + v8i8, v8i16, shr_imm8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, - OpcodeStr, !strconcat(Dt, "32"), v4i16, v4i32, OpNode> { + OpcodeStr, !strconcat(Dt, "32"), + v4i16, v4i32, shr_imm16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin, - OpcodeStr, !strconcat(Dt, "64"), v2i32, v2i64, OpNode> { + OpcodeStr, !strconcat(Dt, "64"), + v2i32, v2i64, shr_imm32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } } @@ -3697,16 +3773,21 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VCNTiD, "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - [(set DPR:$Vd, - (v2i32 (or (and DPR:$Vn, DPR:$src1), - (and DPR:$Vm, (vnotd DPR:$src1)))))]>; + [(set DPR:$Vd, (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; + +def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd), + (and DPR:$Vm, (vnotd DPR:$Vd)))), + (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; + def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VCNTiQ, "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - [(set QPR:$Vd, - (v4i32 (or (and QPR:$Vn, QPR:$src1), - (and QPR:$Vm, (vnotq QPR:$src1)))))]>; + [(set QPR:$Vd, (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; + +def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd), + (and QPR:$Vm, (vnotq QPR:$Vd)))), + (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", @@ -3917,14 +3998,13 @@ defm VSHLs : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm, defm VSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, "vshl", "u", int_arm_neon_vshiftu>; + // VSHL : Vector Shift Left (Immediate) -defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl, - N2RegVShLFrm>; +defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; + // VSHR : Vector Shift Right (Immediate) -defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs, - N2RegVShRFrm>; -defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru, - N2RegVShRFrm>; +defm VSHRs : N2VShR_QHSD<0, 1, 
0b0000, 1, IIC_VSHLiD, "vshr", "s",NEONvshrs>; +defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u",NEONvshru>; // VSHLL : Vector Shift Left Long defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>; @@ -3957,10 +4037,8 @@ defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu>; // VRSHR : Vector Rounding Shift Right -defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs, - N2RegVShRFrm>; -defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru, - N2RegVShRFrm>; +defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s",NEONvrshrs>; +defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u",NEONvrshru>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", @@ -3974,13 +4052,11 @@ defm VQSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu>; // VQSHL : Vector Saturating Shift Left (Immediate) -defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls, - N2RegVShLFrm>; -defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu, - N2RegVShLFrm>; +defm VQSHLsi : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>; +defm VQSHLui : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>; + // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) -defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu, - N2RegVShLFrm>; +defm VQSHLsu : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>; // VQSHRN : Vector Saturating Shift Right and Narrow defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", @@ -4018,9 +4094,10 @@ defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; // VSLI : Vector Shift Left and Insert -defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli, N2RegVShLFrm>; +defm VSLI : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">; + // VSRI : Vector Shift Right and Insert -defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri, N2RegVShRFrm>; +defm VSRI : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">; // Vector Absolute and Saturating Absolute. 
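All of the imm6 = 001xxx / 01xxxx / 1xxxxx annotations in the shift multiclasses above follow from a single encoding rule: the immediate field carries both the element size and the shift amount. The arithmetic, sketched in C++ (derived from the ARM ARM shift encodings, not code from this patch):

// Right shifts encode 2*N - shamt, left shifts encode N + shamt,
// truncated to six bits; for 64-bit elements the op7 "L" bit supplies
// the seventh bit. The leading imm6 bits therefore identify the element
// size, which is what the 001xxx/01xxxx/1xxxxx comments pin down.
static unsigned encodeNEONShiftImm6(unsigned EltBits, unsigned Sh,
                                    bool IsRightShift) {
  return (IsRightShift ? 2 * EltBits - Sh : EltBits + Sh) & 63;
}
// e.g. vshr.s8 #3 -> (16 - 3) & 63 = 13 = 0b001101, matching imm6 = 001xxx.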
@@ -4362,14 +4439,8 @@ def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>; def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>; def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>; -def VDUPfd : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$V), (ins GPR:$R), - IIC_VMOVIS, "vdup", "32", "$V, $R", - [(set DPR:$V, (v2f32 (NEONvdup - (f32 (bitconvert GPR:$R)))))]>; -def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$V), (ins GPR:$R), - IIC_VMOVIS, "vdup", "32", "$V, $R", - [(set QPR:$V, (v4f32 (NEONvdup - (f32 (bitconvert GPR:$R)))))]>; +def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>; +def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>; // VDUP : Vector Duplicate Lane (from scalar to all elements) @@ -4397,9 +4468,6 @@ def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16> { def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32> { let Inst{19} = lane{0}; } -def VDUPLNfd : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32> { - let Inst{19} = lane{0}; -} def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8> { let Inst{19-17} = lane{2-0}; } @@ -4409,9 +4477,12 @@ def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16> { def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32> { let Inst{19} = lane{0}; } -def VDUPLNfq : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32> { - let Inst{19} = lane{0}; -} + +def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), + (VDUPLN32d DPR:$Vm, imm:$lane)>; + +def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)), + (VDUPLN32q DPR:$Vm, imm:$lane)>; def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, @@ -4426,7 +4497,7 @@ def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)), - (v4f32 (VDUPLNfq (v2f32 (EXTRACT_SUBREG QPR:$src, + (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; @@ -4517,12 +4588,12 @@ class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty> def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>; def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>; def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>; -def VREV64df : VREV64D<0b10, "vrev64", "32", v2f32>; +def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>; def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>; def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>; def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>; -def VREV64qf : VREV64Q<0b10, "vrev64", "32", v4f32>; +def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>; // VREV32 : Vector Reverse elements within 32-bit words @@ -4628,8 +4699,8 @@ def VEXTq32 : VEXTq<"vext", "32", v4i32> { let Inst{9-8} = 0b00; } def VEXTqf : VEXTq<"vext", "32", v4f32> { - let Inst{11} = index{0}; - let Inst{10-8} = 0b000; + let Inst{11-10} = index{1-0}; + let Inst{9-8} = 0b00; } // VTRN : Vector Transpose diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 826ef46..8c542fe 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -27,22 +27,22 @@ def imm_comp_XFORM : SDNodeXForm<imm, [{ }]>; /// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7]. 
-def imm0_7 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 8; +def imm0_7 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 8; }]>; def imm0_7_neg : PatLeaf<(i32 imm), [{ return (uint32_t)-N->getZExtValue() < 8; }], imm_neg_XFORM>; -def imm0_255 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 256; +def imm0_255 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 256; }]>; def imm0_255_comp : PatLeaf<(i32 imm), [{ return ~((uint32_t)N->getZExtValue()) < 256; }]>; -def imm8_255 : PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() >= 8 && (uint32_t)N->getZExtValue() < 256; +def imm8_255 : ImmLeaf<i32, [{ + return Imm >= 8 && Imm < 256; }]>; def imm8_255_neg : PatLeaf<(i32 imm), [{ unsigned Val = -N->getZExtValue(); @@ -369,6 +369,15 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in { let Inst{2-0} = 0b000; } + def tBX_Rm : TI<(outs), (ins pred:$p, GPR:$Rm), IIC_Br, "bx${p}\t$Rm", + [/* for disassembly only */]>, + T1Special<{1,1,0,?}> { + // A6.2.3 & A8.6.25 + bits<4> Rm; + let Inst{6-3} = Rm; + let Inst{2-0} = 0b000; + } + // Alternative return instruction used by vararg functions. def tBX_RET_vararg : TI<(outs), (ins tGPR:$Rm), IIC_Br, "bx\t$Rm", @@ -712,6 +721,19 @@ def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, let Inst{7-0} = addr; } +// FIXME: Remove this entry when the above ldr.n workaround is fixed. +// For disassembly use only. +def tLDRpciDIS : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [/* disassembly only */]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + // A8.6.194 & A8.6.192 defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4, t_addrmode_is4, AddrModeT1_4, @@ -1175,10 +1197,18 @@ def tREVSH : // A8.6.136 "revsh", "\t$Rd, $Rm", [(set tGPR:$Rd, (sext_inreg - (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)), + (or (srl tGPR:$Rm, (i32 8)), (shl tGPR:$Rm, (i32 8))), i16))]>, Requires<[IsThumb, IsThumb1Only, HasV6]>; +def : T1Pat<(sext_inreg (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)), + (shl tGPR:$Rm, (i32 8))), i16), + (tREVSH tGPR:$Rm)>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + +def : T1Pat<(sra (bswap tGPR:$Rm), (i32 16)), (tREVSH tGPR:$Rm)>, + Requires<[IsThumb, IsThumb1Only, HasV6]>; + // Rotate right register def tROR : // A8.6.139 T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), @@ -1322,10 +1352,8 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), // Move between coprocessor and ARM core register -- for disassembly only // -class tMovRCopro<string opc, bit direction> - : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), - !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), +class tMovRCopro<string opc, bit direction, dag oops, dag iops> + : T1Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), [/* For disassembly only; pattern left blank */]> { let Inst{27-24} = 0b1110; let Inst{20} = direction; @@ -1346,8 +1374,12 @@ class tMovRCopro<string opc, bit direction> let Inst{19-16} = CRn; } -def tMCR : tMovRCopro<"mcr", 0 /* from ARM core register to coprocessor */>; -def tMRC : tMovRCopro<"mrc", 1 /* from coprocessor to ARM core register */>; +def tMCR : tMovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2)>; +def tMRC : tMovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, + (outs 
GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2)>; class tMovRRCopro<string opc, bit direction> : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), @@ -1420,7 +1452,7 @@ def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br, // from some other function to get here, and we're using the stack frame for the // containing function to save/restore registers, we can't keep anything live in // regs across the eh_sjlj_setjmp(), else it will almost certainly have been -// tromped upon when we get here from a longjmp(). We force everthing out of +// tromped upon when we get here from a longjmp(). We force everything out of // registers except for our own input by listing the relevant registers in // Defs. By doing so, we also cause the prologue/epilogue code to actively // preserve all of the callee-saved registers, which is exactly what we want. diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 0e01be5..600a121 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -44,7 +44,9 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{ // t2_so_imm - Match a 32-bit immediate operand, which is an // 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit // immediate splatted into multiple bytes of the word. -def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_t2_so_imm(N); }]> { +def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{ + return ARM_AM::getT2SOImmVal(Imm) != -1; + }]> { let EncoderMethod = "getT2SOImmOpValue"; } @@ -61,49 +63,15 @@ def t2_so_imm_neg : Operand<i32>, return ARM_AM::getT2SOImmVal(-((uint32_t)N->getZExtValue())) != -1; }], t2_so_imm_neg_XFORM>; -// Break t2_so_imm's up into two pieces. This handles immediates with up to 16 -// bits set in them. This uses t2_so_imm2part to match and t2_so_imm2part_[12] -// to get the first/second pieces. -def t2_so_imm2part : Operand<i32>, - PatLeaf<(imm), [{ - return ARM_AM::isT2SOImmTwoPartVal((unsigned)N->getZExtValue()); - }]> { -} - -def t2_so_imm2part_1 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getT2SOImmTwoPartFirst((unsigned)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; - -def t2_so_imm2part_2 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getT2SOImmTwoPartSecond((unsigned)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; - -def t2_so_neg_imm2part : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::isT2SOImmTwoPartVal(-(int)N->getZExtValue()); - }]> { -} - -def t2_so_neg_imm2part_1 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getT2SOImmTwoPartFirst(-(int)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; - -def t2_so_neg_imm2part_2 : SDNodeXForm<imm, [{ - unsigned V = ARM_AM::getT2SOImmTwoPartSecond(-(int)N->getZExtValue()); - return CurDAG->getTargetConstant(V, MVT::i32); -}]>; - /// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31]. -def imm1_31 : PatLeaf<(i32 imm), [{ - return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 32; +def imm1_31 : ImmLeaf<i32, [{ + return (int32_t)Imm >= 1 && (int32_t)Imm < 32; }]>; /// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095]. 
def imm0_4095 : Operand<i32>, - PatLeaf<(i32 imm), [{ - return (uint32_t)N->getZExtValue() < 4096; + ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 4096; }]>; def imm0_4095_neg : PatLeaf<(i32 imm), [{ @@ -118,6 +86,11 @@ def imm0_255_not : PatLeaf<(i32 imm), [{ return (uint32_t)(~N->getZExtValue()) < 255; }], imm_comp_XFORM>; +def lo5AllOne : PatLeaf<(i32 imm), [{ + // Returns true if all low 5 bits are 1. + return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL; +}]>; + // Define Thumb2 specific addressing modes. // t2addrmode_imm12 := reg + imm12 @@ -129,6 +102,12 @@ def t2addrmode_imm12 : Operand<i32>, let ParserMatchClass = MemMode5AsmOperand; } +// t2ldrlabel := imm12 +def t2ldrlabel : Operand<i32> { + let EncoderMethod = "getAddrModeImm12OpValue"; +} + + // ADR instruction labels. def t2adrlabel : Operand<i32> { let EncoderMethod = "getT2AdrLabelOpValue"; @@ -173,6 +152,15 @@ def t2addrmode_so_reg : Operand<i32>, let ParserMatchClass = MemMode5AsmOperand; } +// t2addrmode_reg := reg +// Used by load/store exclusive instructions. Useful to enable correct assembly +// parsing and printing. Not used for any codegen matching. +// +def t2addrmode_reg : Operand<i32> { + let PrintMethod = "printAddrMode7Operand"; + let MIOperandInfo = (ops tGPR); + let ParserMatchClass = MemMode7AsmOperand; +} //===----------------------------------------------------------------------===// // Multiclass helpers... @@ -700,49 +688,27 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{24-21} = opcod; } } +} // Carry setting variants -let isCodeGenOnly = 1, Defs = [CPSR] in { -multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode, - bit Commutable = 0> { +// NOTE: CPSR def omitted because it will be handled by the custom inserter. +let usesCustomInserter = 1 in { +multiclass T2I_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> { // shifted imm - def ri : T2sTwoRegImm< - (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi, - opc, "\t$Rd, $Rn, $imm", - [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>, - Requires<[IsThumb2]> { - let Inst{31-27} = 0b11110; - let Inst{25} = 0; - let Inst{24-21} = opcod; - let Inst{20} = 1; // The S bit. - let Inst{15} = 0; - } + def ri : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), + Size4Bytes, IIC_iALUi, + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>; // register - def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr, - opc, ".w\t$Rd, $Rn, $Rm", - [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>, - Requires<[IsThumb2]> { + def rr : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), + Size4Bytes, IIC_iALUr, + [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> { let isCommutable = Commutable; - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b01; - let Inst{24-21} = opcod; - let Inst{20} = 1; // The S bit. - let Inst{14-12} = 0b000; // imm3 - let Inst{7-6} = 0b00; // imm2 - let Inst{5-4} = 0b00; // type } // shifted register - def rs : T2sTwoRegShiftedReg< - (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), - IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm", - [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>, - Requires<[IsThumb2]> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b01; - let Inst{24-21} = opcod; - let Inst{20} = 1; // The S bit. 
- } -} + def rs : t2PseudoInst< + (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), + Size4Bytes, IIC_iALUsi, + [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>; } } @@ -864,6 +830,7 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, let Inst{15-12} = Rt; bits<17> addr; + let addr{12} = 1; // add = TRUE let Inst{19-16} = addr{16-13}; // Rn let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm @@ -911,7 +878,7 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc, } // FIXME: Is the pci variant actually needed? - def pci : T2Ipc <(outs GPR:$Rt), (ins i32imm:$addr), iii, + def pci : T2Ipc <(outs GPR:$Rt), (ins t2ldrlabel:$addr), iii, opc, ".w\t$Rt, $addr", [(set GPR:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> { let isReMaterializable = 1; @@ -944,6 +911,7 @@ multiclass T2I_st<bits<2> opcod, string opc, let Inst{15-12} = Rt; bits<17> addr; + let addr{12} = 1; // add = TRUE let Inst{19-16} = addr{16-13}; // Rn let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm @@ -1398,7 +1366,7 @@ def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$Rn), // for disassembly only. // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4 class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, + : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, "\t$Rt, $addr", []> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; @@ -1440,42 +1408,48 @@ def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), def t2STR_PRE : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_iu, - "str", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", + "str", "\t$Rt, [$Rn, $addr]!", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (pre_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePost, IIC_iStore_iu, - "str", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", + "str", "\t$Rt, [$Rn], $addr", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (post_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRH_PRE : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_iu, - "strh", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", + "strh", "\t$Rt, [$Rn, $addr]!", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (pre_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, - "strh", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb", + "strh", "\t$Rt, [$Rn], $addr", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (post_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRB_PRE : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu, - "strb", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb", + "strb", "\t$Rt, [$Rn, $addr]!", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (pre_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb), (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr), AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu, - "strb", "\t$Rt, [$Rn], $addr", 
"$Rn = $base_wb", + "strb", "\t$Rt, [$Rn], $addr", + "$Rn = $base_wb,@earlyclobber $base_wb", [(set GPR:$base_wb, (post_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>; @@ -1483,7 +1457,7 @@ def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb), // only. // Ref: A8.6.193 STR (immediate, Thumb) Encoding T4 class T2IstT<bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, + : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc, "\t$Rt, $addr", []> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; @@ -1508,20 +1482,20 @@ def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; // ldrd / strd pre / post variants // For disassembly only. -def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs GPR:$Rt, GPR:$Rt2), +def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, [$base, $imm]!", []>; -def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs GPR:$Rt, GPR:$Rt2), +def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, [$base], $imm", []>; def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs), - (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), + (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base, $imm]!", []>; def t2STRD_POST : T2Ii8s4<0, 1, 0, (outs), - (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), + (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base], $imm", []>; // T2Ipl (Preload Data/Instruction) signals the memory system of possible future @@ -1541,6 +1515,7 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> { let Inst{15-12} = 0b1111; bits<17> addr; + let addr{12} = 1; // add = TRUE let Inst{19-16} = addr{16-13}; // Rn let Inst{23} = addr{12}; // U let Inst{11-0} = addr{11-0}; // imm12 @@ -1813,10 +1788,8 @@ defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>; defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>; -defm t2ADCS : T2I_adde_sube_s_irs<0b1010, "adc", - BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; -defm t2SBCS : T2I_adde_sube_s_irs<0b1011, "sbc", - BinOpFrag<(sube_live_carry node:$LHS, node:$RHS)>>; +defm t2ADCS : T2I_adde_sube_s_irs<BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>; +defm t2SBCS : T2I_adde_sube_s_irs<BinOpFrag<(sube_live_carry node:$LHS, node:$RHS)>>; // RSB defm t2RSB : T2I_rbin_irs <0b1110, "rsb", @@ -1847,9 +1820,14 @@ def : T2Pat<(addc rGPR:$src, t2_so_imm_neg:$imm), // Effectively, the inverse interpretation of the carry flag already accounts // for part of the negation. 
let AddedComplexity = 1 in -def : T2Pat<(adde rGPR:$src, imm0_255_not:$imm), +def : T2Pat<(adde_dead_carry rGPR:$src, imm0_255_not:$imm), + (t2SBCri rGPR:$src, imm0_255_not:$imm)>; +def : T2Pat<(adde_dead_carry rGPR:$src, t2_so_imm_not:$imm), + (t2SBCri rGPR:$src, t2_so_imm_not:$imm)>; +let AddedComplexity = 1 in +def : T2Pat<(adde_live_carry rGPR:$src, imm0_255_not:$imm), (t2SBCSri rGPR:$src, imm0_255_not:$imm)>; -def : T2Pat<(adde rGPR:$src, t2_so_imm_not:$imm), +def : T2Pat<(adde_live_carry rGPR:$src, t2_so_imm_not:$imm), (t2SBCSri rGPR:$src, t2_so_imm_not:$imm)>; // Select Bytes -- for disassembly only @@ -2052,6 +2030,10 @@ defm t2LSR : T2I_sh_ir<0b01, "lsr", BinOpFrag<(srl node:$LHS, node:$RHS)>>; defm t2ASR : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>; defm t2ROR : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>; +// (rotr x, (and y, 0x...1f)) ==> (ROR x, y) +def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)), + (t2RORrr rGPR:$lhs, rGPR:$rhs)>; + let Uses = [CPSR] in { def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi, "rrx", "\t$Rd, $Rm", @@ -2140,10 +2122,12 @@ def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm), IIC_iUNAsi, "bfc", "\t$Rd, $imm", [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> { let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; + let Inst{5} = 0; // should be 0. bits<10> imm; let msb{4-0} = imm{9-5}; @@ -2176,9 +2160,11 @@ let Constraints = "$src = $Rd" in { [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm))]> { let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{15} = 0; + let Inst{5} = 0; // should be 0. bits<10> imm; let msb{4-0} = imm{9-5}; @@ -2193,9 +2179,11 @@ let Constraints = "$src = $Rd" in { IIC_iBITi, "bfi", "\t$Rd, $Rn, $lsbit, $width", []> { let Inst{31-27} = 0b11110; + let Inst{26} = 0; // should be 0. let Inst{25} = 1; let Inst{24-20} = 0b10110; let Inst{15} = 0; + let Inst{5} = 0; // should be 0. 
bits<5> lsbit; bits<5> width; @@ -2607,9 +2595,15 @@ def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, "revsh", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (sext_inreg - (or (srl (and rGPR:$Rm, 0xFF00), (i32 8)), + (or (srl rGPR:$Rm, (i32 8)), (shl rGPR:$Rm, (i32 8))), i16))]>; +def : T2Pat<(sext_inreg (or (srl (and rGPR:$Rm, 0xFF00), (i32 8)), + (shl rGPR:$Rm, (i32 8))), i16), + (t2REVSH rGPR:$Rm)>; + +def : T2Pat<(sra (bswap rGPR:$Rm), (i32 16)), (t2REVSH rGPR:$Rm)>; + def t2PKHBT : T2ThreeReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh), IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh", @@ -2843,9 +2837,9 @@ class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, let Inst{5-4} = opcod; let Inst{3-0} = 0b1111; - bits<4> Rn; + bits<4> addr; bits<4> Rt; - let Inst{19-16} = Rn; + let Inst{19-16} = addr; let Inst{15-12} = Rt; } class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, @@ -2859,37 +2853,37 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz, let Inst{5-4} = opcod; bits<4> Rd; - bits<4> Rn; + bits<4> addr; bits<4> Rt; - let Inst{11-8} = Rd; - let Inst{19-16} = Rn; + let Inst{3-0} = Rd; + let Inst{19-16} = addr; let Inst{15-12} = Rt; } let mayLoad = 1 in { -def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone, - Size4Bytes, NoItinerary, "ldrexb", "\t$Rt, [$Rn]", +def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone, + Size4Bytes, NoItinerary, "ldrexb", "\t$Rt, $addr", "", []>; -def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone, - Size4Bytes, NoItinerary, "ldrexh", "\t$Rt, [$Rn]", +def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone, + Size4Bytes, NoItinerary, "ldrexh", "\t$Rt, $addr", "", []>; -def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins rGPR:$Rn), AddrModeNone, +def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone, Size4Bytes, NoItinerary, - "ldrex", "\t$Rt, [$Rn]", "", + "ldrex", "\t$Rt, $addr", "", []> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0000101; let Inst{11-8} = 0b1111; let Inst{7-0} = 0b00000000; // imm8 = 0 - bits<4> Rn; bits<4> Rt; - let Inst{19-16} = Rn; + bits<4> addr; + let Inst{19-16} = addr; let Inst{15-12} = Rt; } -def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins rGPR:$Rn), +def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_reg:$addr), AddrModeNone, Size4Bytes, NoItinerary, - "ldrexd", "\t$Rt, $Rt2, [$Rn]", "", + "ldrexd", "\t$Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}> { bits<4> Rt2; let Inst{11-8} = Rt2; @@ -2897,31 +2891,31 @@ def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins rGPR:$Rn), } let mayStore = 1, Constraints = "@earlyclobber $Rd" in { -def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn), - AddrModeNone, Size4Bytes, NoItinerary, - "strexb", "\t$Rd, $Rt, [$Rn]", "", []>; -def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn), - AddrModeNone, Size4Bytes, NoItinerary, - "strexh", "\t$Rd, $Rt, [$Rn]", "", []>; -def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, rGPR:$Rn), - AddrModeNone, Size4Bytes, NoItinerary, - "strex", "\t$Rd, $Rt, [$Rn]", "", - []> { +def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), + AddrModeNone, Size4Bytes, NoItinerary, + "strexb", "\t$Rd, $Rt, $addr", "", []>; +def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), 
+ AddrModeNone, Size4Bytes, NoItinerary, + "strexh", "\t$Rd, $Rt, $addr", "", []>; +def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr), + AddrModeNone, Size4Bytes, NoItinerary, + "strex", "\t$Rd, $Rt, $addr", "", + []> { let Inst{31-27} = 0b11101; let Inst{26-20} = 0b0000100; let Inst{7-0} = 0b00000000; // imm8 = 0 bits<4> Rd; - bits<4> Rn; + bits<4> addr; bits<4> Rt; let Inst{11-8} = Rd; - let Inst{19-16} = Rn; + let Inst{19-16} = addr; let Inst{15-12} = Rt; } def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd), - (ins rGPR:$Rt, rGPR:$Rt2, rGPR:$Rn), + (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_reg:$addr), AddrModeNone, Size4Bytes, NoItinerary, - "strexd", "\t$Rd, $Rt, $Rt2, [$Rn]", "", [], + "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}> { bits<4> Rt2; let Inst{11-8} = Rt2; @@ -2965,7 +2959,7 @@ let isCall = 1, // here, and we're using the stack frame for the containing function to // save/restore registers, we can't keep anything live in regs across // the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon -// when we get here from a longjmp(). We force everthing out of registers +// when we get here from a longjmp(). We force everything out of registers // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved registers, which is exactly what we want. @@ -3238,19 +3232,20 @@ class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin, bits<4> Rn; let Inst{19-16} = Rn; + let Inst{15-0} = 0xc000; } def t2RFEDBW : T2RFE<0b111010000011, - (outs), (ins rGPR:$Rn), NoItinerary, "rfedb", "\t$Rn!", + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!", [/* For disassembly only; pattern left blank */]>; def t2RFEDB : T2RFE<0b111010000001, - (outs), (ins rGPR:$Rn), NoItinerary, "rfeab", "\t$Rn", + (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn", [/* For disassembly only; pattern left blank */]>; def t2RFEIAW : T2RFE<0b111010011011, - (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn!", + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!", [/* For disassembly only; pattern left blank */]>; def t2RFEIA : T2RFE<0b111010011001, - (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn", + (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn", [/* For disassembly only; pattern left blank */]>; //===----------------------------------------------------------------------===// @@ -3352,10 +3347,8 @@ def t2MSR : T2SpecialReg<0b111100111000 /* op31-20 */, 0b10 /* op15-14 */, // Move between coprocessor and ARM core register -- for disassembly only // -class t2MovRCopro<string opc, bit direction> - : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, - GPR:$Rt, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2), - !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), +class t2MovRCopro<string opc, bit direction, dag oops, dag iops> + : T2Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), [/* For disassembly only; pattern left blank */]> { let Inst{27-24} = 0b1110; let Inst{20} = direction; @@ -3376,8 +3369,12 @@ class t2MovRCopro<string opc, bit direction> let Inst{19-16} = CRn; } -def t2MCR2 : t2MovRCopro<"mcr2", 0 /* from ARM core register to coprocessor */>; -def t2MRC2 : t2MovRCopro<"mrc2", 1 /* from coprocessor to ARM core register */>; +def t2MCR2 : t2MovRCopro<"mcr2", 0 /* from ARM core register to coprocessor */, + (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2)>; +def 
t2MRC2 : t2MovRCopro<"mrc2", 1 /* from coprocessor to ARM core register */, + (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, + c_imm:$CRm, i32imm:$opc2)>; class t2MovRRCopro<string opc, bit direction> : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm), diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 2990283..376bd96 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -101,14 +101,6 @@ multiclass vfp_ldst_mult<string asm, bit L_bit, let Inst{21} = 1; // Writeback let Inst{20} = L_bit; } - def DDB : - AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), - IndexModeNone, itin, - !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> { - let Inst{24-23} = 0b10; // Decrement Before - let Inst{21} = 0; // No writeback - let Inst{20} = L_bit; - } def DDB_UPD : AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops), IndexModeUpd, itin_upd, @@ -143,18 +135,6 @@ multiclass vfp_ldst_mult<string asm, bit L_bit, // VFP pipelines. let D = VFPNeonDomain; } - def SDB : - AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), - IndexModeNone, itin, - !strconcat(asm, "db${p}\t$Rn, $regs"), "", []> { - let Inst{24-23} = 0b10; // Decrement Before - let Inst{21} = 0; // No writeback - let Inst{20} = L_bit; - - // Some single precision VFP instructions may be executed on both NEON and - // VFP pipelines. - let D = VFPNeonDomain; - } def SDB_UPD : AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops), IndexModeUpd, itin_upd, @@ -467,6 +447,10 @@ def VMOVRS : AVConv2I<0b11100001, 0b1010, let Inst{6-5} = 0b00; let Inst{3-0} = 0b0000; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } def VMOVSR : AVConv4I<0b11100000, 0b1010, @@ -484,6 +468,10 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010, let Inst{6-5} = 0b00; let Inst{3-0} = 0b0000; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } let neverHasSideEffects = 1 in { @@ -503,6 +491,10 @@ def VMOVRRD : AVConv3I<0b11000101, 0b1011, let Inst{19-16} = Rt2; let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } def VMOVRRS : AVConv3I<0b11000101, 0b1010, @@ -510,6 +502,10 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010, IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src1, $src2", [/* For disassembly only; pattern left blank */]> { let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } } // neverHasSideEffects @@ -532,6 +528,10 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011, let Inst{19-16} = Rt2; let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. + let D = VFPNeonDomain; } let neverHasSideEffects = 1 in @@ -540,6 +540,10 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010, IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2", [/* For disassembly only; pattern left blank */]> { let Inst{7-6} = 0b00; + + // Some single precision VFP instructions may be executed on both NEON and VFP + // pipelines. 
+ let D = VFPNeonDomain; } // FMRDH: SPR -> GPR @@ -972,33 +976,15 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), // let neverHasSideEffects = 1 in { -def VMOVDcc : ADuI<0b11101, 0b11, 0b0000, 0b01, 0, - (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), - IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", +def VMOVDcc : ARMPseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, pred:$p), + Size4Bytes, IIC_fpUNA64, [/*(set DPR:$Dd, (ARMcmov DPR:$Dn, DPR:$Dm, imm:$cc))*/]>, RegConstraint<"$Dn = $Dd">; -def VMOVScc : ASuI<0b11101, 0b11, 0b0000, 0b01, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), - IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", +def VMOVScc : ARMPseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, pred:$p), + Size4Bytes, IIC_fpUNA32, [/*(set SPR:$Sd, (ARMcmov SPR:$Sn, SPR:$Sm, imm:$cc))*/]>, RegConstraint<"$Sn = $Sd">; - -def VNEGDcc : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, - (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm), - IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm", - [/*(set DPR:$Dd, (ARMcneg DPR:$Dn, DPR:$Dm, imm:$cc))*/]>, - RegConstraint<"$Dn = $Dd">; - -def VNEGScc : ASuI<0b11101, 0b11, 0b0001, 0b01, 0, - (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm), - IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm", - [/*(set SPR:$Sd, (ARMcneg SPR:$Sn, SPR:$Sm, imm:$cc))*/]>, - RegConstraint<"$Sn = $Sd"> { - // Some single precision VFP instructions may be executed on both NEON and - // VFP pipelines on A8. - let D = VFPNeonA8Domain; -} } // neverHasSideEffects //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index d9dc5cd..df89fad 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -79,7 +79,7 @@ namespace { unsigned Position; MachineBasicBlock::iterator MBBI; bool Merged; - MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, + MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, MachineBasicBlock::iterator i) : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} }; @@ -174,7 +174,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VLDMSIA; - case ARM_AM::db: return ARM::VLDMSDB; + case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists. } break; case ARM::VSTRS: @@ -182,7 +182,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VSTMSIA; - case ARM_AM::db: return ARM::VSTMSDB; + case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists. } break; case ARM::VLDRD: @@ -190,7 +190,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VLDMDIA; - case ARM_AM::db: return ARM::VLDMDDB; + case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists. } break; case ARM::VSTRD: @@ -198,7 +198,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VSTMDIA; - case ARM_AM::db: return ARM::VSTMDDB; + case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists. 
} break; } @@ -246,13 +246,9 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) { case ARM::t2LDMDB_UPD: case ARM::t2STMDB: case ARM::t2STMDB_UPD: - case ARM::VLDMSDB: case ARM::VLDMSDB_UPD: - case ARM::VSTMSDB: case ARM::VSTMSDB_UPD: - case ARM::VLDMDDB: case ARM::VLDMDDB_UPD: - case ARM::VSTMDDB: case ARM::VSTMDDB_UPD: return ARM_AM::db; @@ -312,6 +308,10 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // VLDM/VSTM do not support DB mode without also updating the base reg. Mode = ARM_AM::db; else if (Offset != 0) { + // Check if this is a supported opcode before we insert instructions to + // calculate a new base register. + if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false; + // If starting offset isn't zero, insert a MI to materialize a new base. // But only do so if it is cost effective, i.e. merging more than two // loads / stores. @@ -354,6 +354,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD); Opcode = getLoadStoreMultipleOpcode(Opcode, Mode); + if (!Opcode) return false; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)) .addReg(Base, getKillRegState(BaseKill)) .addImm(Pred).addReg(PredReg); @@ -453,6 +454,25 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned PRegNum = PMO.isUndef() ? UINT_MAX : getARMRegisterNumbering(PReg); unsigned Count = 1; + unsigned Limit = ~0U; + + // vldm / vstm limit are 32 for S variants, 16 for D variants. + + switch (Opcode) { + default: break; + case ARM::VSTRS: + Limit = 32; + break; + case ARM::VSTRD: + Limit = 16; + break; + case ARM::VLDRD: + Limit = 16; + break; + case ARM::VLDRS: + Limit = 32; + break; + } for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { int NewOffset = MemOps[i].Offset; @@ -460,13 +480,13 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Reg = MO.getReg(); unsigned RegNum = MO.isUndef() ? UINT_MAX : getARMRegisterNumbering(Reg); - // Register numbers must be in ascending order. For VFP, the registers - // must also be consecutive and there is a limit of 16 double-word - // registers per instruction. + // Register numbers must be in ascending order. For VFP / NEON load and + // store multiples, the registers must also be consecutive and within the + // limit on the number of registers per instruction. 
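The condition on the next lines implements the rule just described: offsets must stay contiguous, VFP registers must be consecutive, and the running count must stay under the per-opcode limit (32 for the S variants, 16 for the D variants). A minimal standalone sketch of the same arithmetic, with simplified stand-in types rather than the real MachineOperand-based code:

#include <vector>

struct MemOp { int Offset; unsigned RegNum; }; // simplified stand-in

// Count how many ops starting at index 0 could fold into one VLDM/VSTM,
// given the per-opcode register limit (32 for S variants, 16 for D).
static unsigned countMergeable(const std::vector<MemOp> &Ops,
                               unsigned Size, unsigned Limit) {
  if (Ops.empty()) return 0;
  unsigned Count = 1;
  int Offset = Ops[0].Offset;
  unsigned PRegNum = Ops[0].RegNum;
  for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
    // Contiguous offsets, consecutive registers, and room under the limit.
    if (Ops[i].Offset != Offset + (int)Size ||
        Ops[i].RegNum != PRegNum + 1 || Count >= Limit)
      break;
    Offset += (int)Size;
    PRegNum = Ops[i].RegNum;
    ++Count;
  }
  return Count;
}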
if (Reg != ARM::SP && NewOffset == Offset + (int)Size && - ((isNotVFP && RegNum > PRegNum) - || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) { + ((isNotVFP && RegNum > PRegNum) || + ((Count < Limit) && RegNum == PRegNum+1))) { Offset += Size; PRegNum = RegNum; ++Count; @@ -567,14 +587,10 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { case ARM::t2STMIA: case ARM::t2STMDB: case ARM::VLDMSIA: - case ARM::VLDMSDB: case ARM::VSTMSIA: - case ARM::VSTMSDB: return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4; case ARM::VLDMDIA: - case ARM::VLDMDDB: case ARM::VSTMDIA: - case ARM::VSTMDDB: return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8; } } @@ -624,7 +640,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } break; case ARM::VLDMSIA: - case ARM::VLDMSDB: switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VLDMSIA_UPD; @@ -632,7 +647,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } break; case ARM::VLDMDIA: - case ARM::VLDMDDB: switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VLDMDIA_UPD; @@ -640,7 +654,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } break; case ARM::VSTMSIA: - case ARM::VSTMSDB: switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VSTMSIA_UPD; @@ -648,7 +661,6 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } break; case ARM::VSTMDIA: - case ARM::VSTMDDB: switch (Mode) { default: llvm_unreachable("Unhandled submode!"); case ARM_AM::ia: return ARM::VSTMDIA_UPD; @@ -749,7 +761,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, MIB.addOperand(MI->getOperand(OpNum)); // Transfer memoperands. - (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); MBB.erase(MBBI); return true; @@ -1275,14 +1287,14 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, CurrPred, CurrPredReg, Scratch, MemOps, Merges); - // Try folding preceeding/trailing base inc/dec into the generated + // Try folding preceding/trailing base inc/dec into the generated // LDM/STM ops. for (unsigned i = 0, e = Merges.size(); i < e; ++i) if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) ++NumMerges; NumMerges += Merges.size(); - // Try folding preceeding/trailing base inc/dec into those load/store + // Try folding preceding/trailing base inc/dec into those load/store // that were not merged to form LDM/STM ops. for (unsigned i = 0; i != NumMemOps; ++i) if (!MemOps[i].Merged) @@ -1292,7 +1304,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // RS may be pointing to an instruction that's deleted. RS->skipTo(prior(MBBI)); } else if (NumMemOps == 1) { - // Try folding preceeding/trailing base inc/dec into the single + // Try folding preceding/trailing base inc/dec into the single // load/store. if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { ++NumMerges; @@ -1322,7 +1334,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } /// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops -/// ("bx lr" and "mov pc, lr") into the preceeding stack restore so it +/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it /// directly restore the value of LR into pc. 
/// ldmfd sp!, {..., lr} /// bx lr @@ -1530,15 +1542,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, // Then make sure the immediate offset fits. int OffImm = getMemoryOpOffset(Op0); if (isT2) { - if (OffImm < 0) { - if (OffImm < -255) - // Can't fall back to t2LDRi8 / t2STRi8. - return false; - } else { - int Limit = (1 << 8) * Scale; - if (OffImm >= Limit || (OffImm & (Scale-1))) - return false; - } + int Limit = (1 << 8) * Scale; + if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1))) + return false; Offset = OffImm; } else { ARM_AM::AddrOpc AddSub = ARM_AM::add; diff --git a/lib/Target/ARM/ARMMCAsmInfo.cpp b/lib/Target/ARM/ARMMCAsmInfo.cpp index 53edfca..a3f89e9 100644 --- a/lib/Target/ARM/ARMMCAsmInfo.cpp +++ b/lib/Target/ARM/ARMMCAsmInfo.cpp @@ -12,8 +12,16 @@ //===----------------------------------------------------------------------===// #include "ARMMCAsmInfo.h" +#include "llvm/Support/CommandLine.h" + using namespace llvm; +cl::opt<bool> +EnableARMEHABI("arm-enable-ehabi", cl::Hidden, + cl::desc("Generate ARM EHABI tables"), + cl::init(false)); + + static const char *const arm_asm_table[] = { "{r0}", "r0", "{r1}", "r1", @@ -65,4 +73,8 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() { DwarfRequiresFrameSection = false; SupportsDebugInformation = true; + + // Exceptions handling + if (EnableARMEHABI) + ExceptionsType = ExceptionHandling::ARM; } diff --git a/lib/Target/ARM/ARMMCCodeEmitter.cpp b/lib/Target/ARM/ARMMCCodeEmitter.cpp index 6d7b485..10607b1 100644 --- a/lib/Target/ARM/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/ARMMCCodeEmitter.cpp @@ -278,6 +278,15 @@ public: unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned NEONThumb2DataIPostEncoder(const MCInst &MI, unsigned EncodedValue) const; unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI, @@ -1201,6 +1210,30 @@ getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op, return MO.getReg(); } +unsigned ARMMCCodeEmitter:: +getShiftRight8Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 8 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight16Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 16 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight32Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 32 - MI.getOperand(Op).getImm(); +} + +unsigned ARMMCCodeEmitter:: +getShiftRight64Imm(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups) const { + return 64 - MI.getOperand(Op).getImm(); +} + void ARMMCCodeEmitter:: EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const { diff --git a/lib/Target/ARM/ARMMCExpr.h b/lib/Target/ARM/ARMMCExpr.h index d42f766..0a2e883 100644 --- a/lib/Target/ARM/ARMMCExpr.h +++ b/lib/Target/ARM/ARMMCExpr.h @@ -60,6 +60,9 @@ public: bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout) const; void AddValueSymbols(MCAssembler *) const; + const MCSection 
*FindAssociatedSection() const { + return getSubExpr()->FindAssociatedSection(); + } static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp index ad51bc1..1cba1ba 100644 --- a/lib/Target/ARM/ARMRegisterInfo.cpp +++ b/lib/Target/ARM/ARMRegisterInfo.cpp @@ -12,26 +12,8 @@ //===----------------------------------------------------------------------===// #include "ARM.h" -#include "ARMAddressingModes.h" #include "ARMBaseInstrInfo.h" -#include "ARMInstrInfo.h" -#include "ARMMachineFunctionInfo.h" #include "ARMRegisterInfo.h" -#include "ARMSubtarget.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLocation.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" using namespace llvm; ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii, diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 22d15b5..54bf82a 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -70,6 +70,8 @@ def R4 : ARMReg< 4, "r4">, DwarfRegNum<[4]>; def R5 : ARMReg< 5, "r5">, DwarfRegNum<[5]>; def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>; def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>; +// These require 32-bit instructions. +let CostPerUse = 1 in { def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>; def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>; def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>; @@ -78,6 +80,7 @@ def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>; def SP : ARMReg<13, "sp">, DwarfRegNum<[13]>; def LR : ARMReg<14, "lr">, DwarfRegNum<[14]>; def PC : ARMReg<15, "pc">, DwarfRegNum<[15]>; +} // Float registers def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">; @@ -99,33 +102,41 @@ def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">; // Aliases of the F* registers used to hold 64-bit fp values (doubles) let SubRegIndices = [ssub_0, ssub_1] in { -def D0 : ARMReg< 0, "d0", [S0, S1]>; -def D1 : ARMReg< 1, "d1", [S2, S3]>; -def D2 : ARMReg< 2, "d2", [S4, S5]>; -def D3 : ARMReg< 3, "d3", [S6, S7]>; -def D4 : ARMReg< 4, "d4", [S8, S9]>; -def D5 : ARMReg< 5, "d5", [S10, S11]>; -def D6 : ARMReg< 6, "d6", [S12, S13]>; -def D7 : ARMReg< 7, "d7", [S14, S15]>; -def D8 : ARMReg< 8, "d8", [S16, S17]>; -def D9 : ARMReg< 9, "d9", [S18, S19]>; -def D10 : ARMReg<10, "d10", [S20, S21]>; -def D11 : ARMReg<11, "d11", [S22, S23]>; -def D12 : ARMReg<12, "d12", [S24, S25]>; -def D13 : ARMReg<13, "d13", [S26, S27]>; -def D14 : ARMReg<14, "d14", [S28, S29]>; -def D15 : ARMReg<15, "d15", [S30, S31]>; +def D0 : ARMReg< 0, "d0", [S0, S1]>, DwarfRegNum<[256]>; +def D1 : ARMReg< 1, "d1", [S2, S3]>, DwarfRegNum<[257]>; +def D2 : ARMReg< 2, "d2", [S4, S5]>, DwarfRegNum<[258]>; +def D3 : ARMReg< 3, "d3", [S6, S7]>, DwarfRegNum<[259]>; +def D4 : ARMReg< 4, "d4", [S8, S9]>, DwarfRegNum<[260]>; +def D5 : ARMReg< 5, "d5", [S10, S11]>, DwarfRegNum<[261]>; +def D6 : ARMReg< 6, "d6", [S12, S13]>, DwarfRegNum<[262]>; +def D7 : ARMReg< 7, "d7", [S14, S15]>, DwarfRegNum<[263]>; +def D8 : ARMReg< 8, "d8", [S16, S17]>, DwarfRegNum<[264]>; 
+def D9 : ARMReg< 9, "d9", [S18, S19]>, DwarfRegNum<[265]>; +def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>; +def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>; +def D12 : ARMReg<12, "d12", [S24, S25]>, DwarfRegNum<[268]>; +def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>; +def D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>; +def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>; } // VFP3 defines 16 additional double registers -def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">; -def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">; -def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">; -def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">; -def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">; -def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">; -def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">; -def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">; +def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; +def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>; +def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>; +def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>; +def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>; +def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>; +def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; +def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>; +def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>; +def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>; +def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>; +def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>; +def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>; +def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>; +def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>; +def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>; // Advanced SIMD (NEON) defines 16 quad-word aliases let SubRegIndices = [dsub_0, dsub_1], diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 82c6735..49fedf6 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -656,19 +656,19 @@ def CortexA9Itineraries : ProcessorItineraries< [1, 1, 1]>, // // Single-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. InstrItinData<IIC_fpMOVSI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, - InstrStage<1, [A9_MUX0], 0>, - InstrStage<1, [A9_DRegsVFP], 0, Required>, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_NPipe]>], + InstrStage<1, [A9_MUX0], 0>], [2, 1]>, // // Double-precision to Integer Move + // + // On A9 move-from-VFP is free to issue with no stall if other VFP + // operations are in flight. I assume it still can't dual-issue though. InstrItinData<IIC_fpMOVDI, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, - InstrStage<1, [A9_MUX0], 0>, - InstrStage<1, [A9_DRegsVFP], 0, Required>, - InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_NPipe]>], + InstrStage<1, [A9_MUX0], 0>], [2, 1, 1]>, // // Single-precision FP Load @@ -691,20 +691,22 @@ def CortexA9Itineraries : ProcessorItineraries< [2, 1]>, // // FP Load Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. 
InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, // // FP Load Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, // // Single-precision FP Store InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -725,205 +727,206 @@ def CortexA9Itineraries : ProcessorItineraries< [1, 1]>, // // FP Store Multiple + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>, // // FP Store Multiple + update + // FIXME: assumes 2 doubles which requires 2 LS cycles. InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsVFP], 0, Required>, InstrStage<2, [A9_DRegsN], 0, Reserved>, InstrStage<1, [A9_NPipe], 0>, - InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>, // NEON // VLD1 - // FIXME: Conservatively assume insufficent alignment. InstrItinData<IIC_VLD1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1]>, // VLD1x2 InstrItinData<IIC_VLD1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 1]>, // VLD1x3 InstrItinData<IIC_VLD1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 1]>, // VLD1x4 InstrItinData<IIC_VLD1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 3, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, // VLD1u InstrItinData<IIC_VLD1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - 
InstrStage<2, [A9_LSUnit]>], - [2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 2, 1]>, // VLD1x2u InstrItinData<IIC_VLD1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [2, 2, 2, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [1, 1, 2, 1]>, // VLD1x3u InstrItinData<IIC_VLD1x3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 1]>, // VLD1x4u InstrItinData<IIC_VLD1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [2, 2, 3, 3, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [1, 1, 2, 2, 2, 1]>, // // VLD1ln InstrItinData<IIC_VLD1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 1, 1, 1]>, // // VLD1lnu InstrItinData<IIC_VLD1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 2, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 2, 1, 1, 1, 1]>, // // VLD1dup InstrItinData<IIC_VLD1dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 1]>, // // VLD1dupu InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 2, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1, 1]>, // // VLD2 InstrItinData<IIC_VLD2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, // // VLD2x2 InstrItinData<IIC_VLD2x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, 
InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 4, 3, 4, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 1]>, // // VLD2ln InstrItinData<IIC_VLD2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 4, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 1, 1, 1, 1]>, // // VLD2u InstrItinData<IIC_VLD2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, // Extra latency cycles since wbck is 7 cycles - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 2, 1, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1, 1]>, // // VLD2x2u InstrItinData<IIC_VLD2x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 4, 3, 4, 2, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 3, 2, 3, 2, 1]>, // // VLD2lnu InstrItinData<IIC_VLD2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [4, 4, 2, 1, 1, 1, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [3, 3, 2, 1, 1, 1, 1, 1]>, // // VLD2dup InstrItinData<IIC_VLD2dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 1]>, // // VLD2dupu InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], - [3, 3, 2, 1, 1]>, + InstrStage<7, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], + [2, 2, 2, 1, 1]>, // // VLD3 InstrItinData<IIC_VLD3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 1]>, // // VLD3ln InstrItinData<IIC_VLD3ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -938,10 +941,10 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, 
Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 2, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 2, 1]>, // // VLD3lnu InstrItinData<IIC_VLD3lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, @@ -974,108 +977,108 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VLD4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 5, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 1]>, // // VLD4ln InstrItinData<IIC_VLD4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<11,[A9_DRegsVFP], 0, Reserved>, - InstrStage<5, [A9_NPipe], 0>, - InstrStage<5, [A9_LSUnit]>], - [5, 5, 6, 6, 1, 1, 1, 1, 2, 2]>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>, // // VLD4u InstrItinData<IIC_VLD4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<10,[A9_DRegsVFP], 0, Reserved>, - InstrStage<4, [A9_NPipe], 0>, - InstrStage<4, [A9_LSUnit]>], - [4, 4, 5, 5, 2, 1]>, + InstrStage<9,[A9_DRegsVFP], 0, Reserved>, + InstrStage<3, [A9_NPipe], 0>, + InstrStage<3, [A9_LSUnit]>], + [3, 3, 4, 4, 2, 1]>, // // VLD4lnu InstrItinData<IIC_VLD4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<11,[A9_DRegsVFP], 0, Reserved>, - InstrStage<5, [A9_NPipe], 0>, - InstrStage<5, [A9_LSUnit]>], - [5, 5, 6, 6, 2, 1, 1, 1, 1, 1, 2, 2]>, + InstrStage<10,[A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe], 0>, + InstrStage<4, [A9_LSUnit]>], + [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>, // // VLD4dup InstrItinData<IIC_VLD4dup, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 3, 4, 4, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 1]>, // // VLD4dupu InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], - [3, 3, 4, 4, 2, 1, 1]>, + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], + [2, 2, 3, 3, 2, 1, 1]>, // // VST1 InstrItinData<IIC_VST1, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // VST1x2 InstrItinData<IIC_VST1x2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + 
InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST1x3 InstrItinData<IIC_VST1x3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2]>, // // VST1x4 InstrItinData<IIC_VST1x4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST1u InstrItinData<IIC_VST1u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1]>, // // VST1x2u InstrItinData<IIC_VST1x2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST1x3u @@ -1083,44 +1086,44 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2]>, // // VST1x4u InstrItinData<IIC_VST1x4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // // VST1ln InstrItinData<IIC_VST1ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>, // // VST1lnu InstrItinData<IIC_VST1lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1]>, // // VST2 InstrItinData<IIC_VST2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 
1, 1, 1]>, // // VST2x2 @@ -1136,9 +1139,9 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST2u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST2x2u @@ -1154,36 +1157,36 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST2ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<2, [A9_DRegsVFP], 0, Reserved>, - InstrStage<2, [A9_NPipe], 0>, - InstrStage<2, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>, // // VST2lnu InstrItinData<IIC_VST2lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<1, [A9_DRegsVFP], 0, Reserved>, + InstrStage<1, [A9_NPipe], 0>, + InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1]>, // // VST3 InstrItinData<IIC_VST3, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2]>, // // VST3u InstrItinData<IIC_VST3u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2]>, // // VST3ln @@ -1208,36 +1211,36 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData<IIC_VST4, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST4u InstrItinData<IIC_VST4u, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // // VST4ln InstrItinData<IIC_VST4ln, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, [A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1, 2, 2]>, // // VST4lnu InstrItinData<IIC_VST4lnu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_DRegsN], 0, Required>, - InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<3, 
[A9_NPipe], 0>, - InstrStage<3, [A9_LSUnit]>], + InstrStage<2, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe], 0>, + InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1, 1, 1, 2, 2]>, // diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 2b9202b..aa1e398 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -35,7 +35,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, // This requires 4-byte alignment. if ((Align & 3) != 0) return SDValue(); - // This requires the copy size to be a constant, preferrably + // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); if (!ConstantSize) diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 1465984..c6f266b 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -38,6 +38,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, , ARMFPUType(None) , UseNEONForSinglePrecisionFP(false) , SlowFPVMLx(false) + , HasVMLxForwarding(false) , SlowFPBrcc(false) , IsThumb(isT) , ThumbMode(Thumb1) @@ -51,6 +52,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, , HasT2ExtractPack(false) , HasDataBarrier(false) , Pref32BitThumb(false) + , AvoidCPSRPartialUpdate(false) , HasMPExtension(false) , FPOnlySP(false) , AllowsUnalignedMem(false) diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 76c1c3f..0271c87 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -61,6 +61,10 @@ protected: /// whether the FP VML[AS] instructions are slow (if so, don't use them). bool SlowFPVMLx; + /// HasVMLxForwarding - If true, NEON has special multiplier accumulator + /// forwarding to allow mul + mla being issued back to back. + bool HasVMLxForwarding; + /// SlowFPBrcc - True if floating point compare + branch is slow. bool SlowFPBrcc; @@ -106,6 +110,11 @@ protected: /// over 16-bit ones. bool Pref32BitThumb; + /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions + /// that partially update CPSR and add false dependency on the previous + /// CPSR setting instruction. + bool AvoidCPSRPartialUpdate; + /// HasMPExtension - True if the subtarget supports Multiprocessing /// extension (ARMv7 only). 
bool HasMPExtension; @@ -182,15 +191,19 @@ protected: bool hasT2ExtractPack() const { return HasT2ExtractPack; } bool hasDataBarrier() const { return HasDataBarrier; } bool useFPVMLx() const { return !SlowFPVMLx; } + bool hasVMLxForwarding() const { return HasVMLxForwarding; } bool isFPBrccSlow() const { return SlowFPBrcc; } bool isFPOnlySP() const { return FPOnlySP; } bool prefers32BitThumb() const { return Pref32BitThumb; } + bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; } bool hasMPExtension() const { return HasMPExtension; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } - bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetELF() const { return !isTargetDarwin(); } bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 0ee773b..29aa4f7 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -22,16 +22,13 @@ #include "llvm/Target/TargetRegistry.h" using namespace llvm; -static cl::opt<bool>ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden); - static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin()) return new ARMMCAsmInfoDarwin(); - default: - return new ARMELFMCAsmInfo(); - } + + return new ARMELFMCAsmInfo(); } // This is duplicated code. Refactor this. @@ -41,17 +38,17 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, MCCodeEmitter *Emitter, bool RelaxAll, bool NoExecStack) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + + if (TheTriple.isOSWindows()) { llvm_unreachable("ARM does not support Windows COFF format"); return NULL; - default: - return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); } + + return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); } extern "C" void LLVMInitializeARMTarget() { @@ -86,8 +83,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, : LLVMTargetMachine(T, TT), Subtarget(TT, FS, isThumb), JITInfo(), - InstrItins(Subtarget.getInstrItineraryData()) -{ + InstrItins(Subtarget.getInstrItineraryData()) { DefRelocModel = getRelocationModel(); } @@ -149,8 +145,7 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, // FIXME: temporarily disabling load / store optimization pass for Thumb1. 
if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass(true)); - if (ExpandMLx && - OptLevel != CodeGenOpt::None && Subtarget.hasVFP2()) + if (OptLevel != CodeGenOpt::None && Subtarget.isCortexA9()) PM.add(createMLxExpansionPass()); return true; diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index 7535da5..19defa1 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -36,8 +36,9 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, ELF::SHF_WRITE | ELF::SHF_ALLOC, SectionKind::getDataRel()); + LSDASection = NULL; } - + AttributesSection = getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 129af20..29ecc18 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -29,15 +29,6 @@ #include "llvm/ADT/Twine.h" using namespace llvm; -/// Shift types used for register controlled shifts in ARM memory addressing. -enum ShiftType { - Lsl, - Lsr, - Asr, - Ror, - Rrx -}; - namespace { class ARMOperand; @@ -55,8 +46,10 @@ class ARMAsmParser : public TargetAsmParser { int TryParseRegister(); virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); bool TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &); + bool TryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &); bool ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &); - bool ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &); + bool ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &, + ARMII::AddrMode AddrMode); bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic); bool ParsePrefix(ARMMCExpr::VariantKind &RefKind); const MCExpr *ApplyPrefixToExpr(const MCExpr *E, @@ -65,13 +58,14 @@ class ARMAsmParser : public TargetAsmParser { bool ParseMemoryOffsetReg(bool &Negative, bool &OffsetRegShifted, - enum ShiftType &ShiftType, + enum ARM_AM::ShiftOpc &ShiftType, const MCExpr *&ShiftAmount, const MCExpr *&Offset, bool &OffsetIsReg, int &OffsetRegNum, SMLoc &E); - bool ParseShift(enum ShiftType &St, const MCExpr *&ShiftAmount, SMLoc &E); + bool ParseShift(enum ARM_AM::ShiftOpc &St, + const MCExpr *&ShiftAmount, SMLoc &E); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveThumb(SMLoc L); bool ParseDirectiveThumbFunc(SMLoc L); @@ -102,10 +96,25 @@ class ARMAsmParser : public TargetAsmParser { SmallVectorImpl<MCParsedAsmOperand*>&); OperandMatchResultTy tryParseMSRMaskOperand( SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy tryParseMemMode2Operand( + SmallVectorImpl<MCParsedAsmOperand*>&); + OperandMatchResultTy tryParseMemMode3Operand( + SmallVectorImpl<MCParsedAsmOperand*>&); + + // Asm Match Converter Methods + bool CvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool CvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool CvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); + bool CvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode, + const SmallVectorImpl<MCParsedAsmOperand*> &); public: ARMAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &_TM) : TargetAsmParser(T), Parser(_Parser), TM(_TM) { + MCAsmParserExtension::Initialize(_Parser); // 
Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures( &TM.getSubtarget<ARMSubtarget>())); @@ -136,6 +145,7 @@ class ARMOperand : public MCParsedAsmOperand { RegisterList, DPRRegisterList, SPRRegisterList, + Shifter, Token } Kind; @@ -178,13 +188,14 @@ class ARMOperand : public MCParsedAsmOperand { /// Combined record for all forms of ARM address expressions. struct { + ARMII::AddrMode AddrMode; unsigned BaseRegNum; union { unsigned RegNum; ///< Offset register num, when OffsetIsReg. const MCExpr *Value; ///< Offset value, when !OffsetIsReg. } Offset; const MCExpr *ShiftAmount; // used when OffsetRegShifted is true - enum ShiftType ShiftType; // used when OffsetRegShifted is true + enum ARM_AM::ShiftOpc ShiftType; // used when OffsetRegShifted is true unsigned OffsetRegShifted : 1; // only used when OffsetIsReg is true unsigned Preindexed : 1; unsigned Postindexed : 1; @@ -192,6 +203,11 @@ class ARMOperand : public MCParsedAsmOperand { unsigned Negative : 1; // only used when OffsetIsReg is true unsigned Writeback : 1; } Mem; + + struct { + ARM_AM::ShiftOpc ShiftTy; + unsigned RegNum; + } Shift; }; ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} @@ -234,6 +250,10 @@ public: break; case ProcIFlags: IFlags = o.IFlags; + break; + case Shifter: + Shift = o.Shift; + break; } } @@ -290,7 +310,9 @@ public: /// @name Memory Operand Accessors /// @{ - + ARMII::AddrMode getMemAddrMode() const { + return Mem.AddrMode; + } unsigned getMemBaseRegNum() const { return Mem.BaseRegNum; } @@ -310,7 +332,7 @@ public: assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!"); return Mem.ShiftAmount; } - enum ShiftType getMemShiftType() const { + enum ARM_AM::ShiftOpc getMemShiftType() const { assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!"); return Mem.ShiftType; } @@ -334,6 +356,52 @@ public: bool isToken() const { return Kind == Token; } bool isMemBarrierOpt() const { return Kind == MemBarrierOpt; } bool isMemory() const { return Kind == Memory; } + bool isShifter() const { return Kind == Shifter; } + bool isMemMode2() const { + if (getMemAddrMode() != ARMII::AddrMode2) + return false; + + if (getMemOffsetIsReg()) + return true; + + if (getMemNegative() && + !(getMemPostindexed() || getMemPreindexed())) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + if (!CE) return false; + int64_t Value = CE->getValue(); + + // The offset must be in the range 0-4095 (imm12). + if (Value > 4095 || Value < -4095) + return false; + + return true; + } + bool isMemMode3() const { + if (getMemAddrMode() != ARMII::AddrMode3) + return false; + + if (getMemOffsetIsReg()) { + if (getMemOffsetRegShifted()) + return false; // No shift with offset reg allowed + return true; + } + + if (getMemNegative() && + !(getMemPostindexed() || getMemPreindexed())) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + if (!CE) return false; + int64_t Value = CE->getValue(); + + // The offset must be in the range 0-255 (imm8). 
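The range check that follows accepts both signs even though the immediate field itself is only eight bits: the magnitude goes into imm8 and the sign travels separately as the add/sub flag, which is how addMemMode3Operands later splits it via ARM_AM::getAM3Opc. A small illustrative helper under that assumption (relying on the in-tree ARMAddressingModes.h; not part of the patch):

#include "ARMAddressingModes.h" // in-tree header providing ARM_AM
#include <cassert>
#include <cstdlib>

// Split a validated mode-3 offset in [-255, 255] into an add/sub flag
// plus an 8-bit magnitude, packed together with the index mode.
static unsigned encodeAM3Offset(int64_t Value, unsigned IdxMode) {
  assert(Value <= 255 && Value >= -255 && "offset must fit in imm8");
  ARM_AM::AddrOpc Opc = Value < 0 ? ARM_AM::sub : ARM_AM::add;
  return ARM_AM::getAM3Opc(Opc, (unsigned char)std::llabs(Value), IdxMode);
}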
+ if (Value > 255 || Value < -255) + return false; + + return true; + } bool isMemMode5() const { if (!isMemory() || getMemOffsetIsReg() || getMemWriteback() || getMemNegative()) @@ -346,6 +414,23 @@ public: int64_t Value = CE->getValue(); return ((Value & 0x3) == 0 && Value <= 1020 && Value >= -1020); } + bool isMemMode7() const { + if (!isMemory() || + getMemPreindexed() || + getMemPostindexed() || + getMemOffsetIsReg() || + getMemNegative() || + getMemWriteback()) + return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + if (!CE) return false; + + if (CE->getValue()) + return false; + + return true; + } bool isMemModeRegThumb() const { if (!isMemory() || !getMemOffsetIsReg() || getMemWriteback()) return false; @@ -402,6 +487,12 @@ public: Inst.addOperand(MCOperand::CreateReg(getReg())); } + void addShifterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm( + ARM_AM::getSORegOpc(Shift.ShiftTy, 0))); + } + void addRegListOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const SmallVectorImpl<unsigned> &RegList = getRegList(); @@ -428,6 +519,88 @@ public: Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt()))); } + void addMemMode7Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && isMemMode7() && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum())); + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + (void)CE; + assert((CE || CE->getValue() == 0) && + "No offset operand support in mode 7"); + } + + void addMemMode2Operands(MCInst &Inst, unsigned N) const { + assert(isMemMode2() && "Invalid mode or number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum())); + unsigned IdxMode = (getMemPreindexed() | getMemPostindexed() << 1); + + if (getMemOffsetIsReg()) { + Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum())); + + ARM_AM::AddrOpc AMOpc = getMemNegative() ? ARM_AM::sub : ARM_AM::add; + ARM_AM::ShiftOpc ShOpc = ARM_AM::no_shift; + int64_t ShiftAmount = 0; + + if (getMemOffsetRegShifted()) { + ShOpc = getMemShiftType(); + const MCConstantExpr *CE = + dyn_cast<MCConstantExpr>(getMemShiftAmount()); + ShiftAmount = CE->getValue(); + } + + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(AMOpc, ShiftAmount, + ShOpc, IdxMode))); + return; + } + + // Create a operand placeholder to always yield the same number of operands. + Inst.addOperand(MCOperand::CreateReg(0)); + + // FIXME: #-0 is encoded differently than #0. Does the parser preserve + // the difference? + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset()); + assert(CE && "Non-constant mode 2 offset operand!"); + int64_t Offset = CE->getValue(); + + if (Offset >= 0) + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(ARM_AM::add, + Offset, ARM_AM::no_shift, IdxMode))); + else + Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(ARM_AM::sub, + -Offset, ARM_AM::no_shift, IdxMode))); + } + + void addMemMode3Operands(MCInst &Inst, unsigned N) const { + assert(isMemMode3() && "Invalid mode or number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum())); + unsigned IdxMode = (getMemPreindexed() | getMemPostindexed() << 1); + + if (getMemOffsetIsReg()) { + Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum())); + + ARM_AM::AddrOpc AMOpc = getMemNegative() ? 
+      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(AMOpc, 0,
+                                                             IdxMode)));
+      return;
+    }
+
+    // Create an operand placeholder to always yield the same number of
+    // operands.
+    Inst.addOperand(MCOperand::CreateReg(0));
+
+    // FIXME: #-0 is encoded differently than #0. Does the parser preserve
+    // the difference?
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+    assert(CE && "Non-constant mode 3 offset operand!");
+    int64_t Offset = CE->getValue();
+
+    if (Offset >= 0)
+      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(ARM_AM::add,
+                                                             Offset, IdxMode)));
+    else
+      Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(ARM_AM::sub,
+                                                             -Offset, IdxMode)));
+  }
+
   void addMemMode5Operands(MCInst &Inst, unsigned N) const {
     assert(N == 2 && isMemMode5() && "Invalid number of operands!");
@@ -525,6 +698,15 @@ public:
     return Op;
   }

+  static ARMOperand *CreateShifter(ARM_AM::ShiftOpc ShTy,
+                                   SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(Shifter);
+    Op->Shift.ShiftTy = ShTy;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static ARMOperand *
   CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs,
                 SMLoc StartLoc, SMLoc EndLoc) {
@@ -553,9 +735,10 @@ public:
     return Op;
   }

-  static ARMOperand *CreateMem(unsigned BaseRegNum, bool OffsetIsReg,
-                               const MCExpr *Offset, int OffsetRegNum,
-                               bool OffsetRegShifted, enum ShiftType ShiftType,
+  static ARMOperand *CreateMem(ARMII::AddrMode AddrMode, unsigned BaseRegNum,
+                               bool OffsetIsReg, const MCExpr *Offset,
+                               int OffsetRegNum, bool OffsetRegShifted,
+                               enum ARM_AM::ShiftOpc ShiftType,
                                const MCExpr *ShiftAmount, bool Preindexed,
                                bool Postindexed, bool Negative, bool Writeback,
                                SMLoc S, SMLoc E) {
@@ -571,6 +754,7 @@ public:
            "Cannot have expression offset and register offset!");

     ARMOperand *Op = new ARMOperand(Memory);
+    Op->Mem.AddrMode = AddrMode;
     Op->Mem.BaseRegNum = BaseRegNum;
     Op->Mem.OffsetIsReg = OffsetIsReg;
     if (OffsetIsReg)
@@ -642,7 +826,8 @@ void ARMOperand::dump(raw_ostream &OS) const {
     break;
   case Memory:
     OS << "<memory "
-       << "base:" << getMemBaseRegNum();
+       << "am:" << ARMII::AddrModeToString(getMemAddrMode())
+       << " base:" << getMemBaseRegNum();
     if (getMemOffsetIsReg()) {
       OS << " offset:<register " << getMemOffsetRegNum();
       if (getMemOffsetRegShifted()) {
@@ -676,6 +861,9 @@ void ARMOperand::dump(raw_ostream &OS) const {
   case Register:
     OS << "<register " << getReg() << ">";
     break;
+  case Shifter:
+    OS << "<shifter " << getShiftOpcStr(Shift.ShiftTy) << ">";
+    break;
   case RegisterList:
   case DPRRegisterList:
   case SPRRegisterList: {
@@ -738,6 +926,42 @@ int ARMAsmParser::TryParseRegister() {
   return RegNum;
 }

+/// Try to parse a shift operation. The token must be an Identifier when
+/// called, and if it names a shift type the tokens are eaten, register and
+/// shifter operands are created, and false is returned. Otherwise return
+/// true.
+///
+bool ARMAsmParser::TryParseShiftRegister(
+  SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  std::string upperCase = Tok.getString().str();
+  std::string lowerCase = LowercaseString(upperCase);
+  ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase)
+    .Case("lsl", ARM_AM::lsl)
+    .Case("lsr", ARM_AM::lsr)
+    .Case("asr", ARM_AM::asr)
+    .Case("ror", ARM_AM::ror)
+    .Case("rrx", ARM_AM::rrx)
+    .Default(ARM_AM::no_shift);
+
+  if (ShiftTy == ARM_AM::no_shift)
+    return true;
+
+  Parser.Lex(); // Eat the shift type token.
+  int RegNum = TryParseRegister();
+  if (RegNum == -1)
+    return Error(Parser.getTok().getLoc(), "register expected");
+
+  Operands.push_back(ARMOperand::CreateReg(RegNum, S, Parser.getTok().getLoc()));
+  Operands.push_back(ARMOperand::CreateShifter(ShiftTy,
+                                               S, Parser.getTok().getLoc()));
+
+  return false;
+}
+
+
 /// Try to parse a register name. The token must be an Identifier when called.
 /// If it's a register, an AsmOperand is created. Another AsmOperand is created
 /// if there is a "writeback". 'true' if it's not a register.
@@ -1046,13 +1270,96 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }

+/// tryParseMemMode2Operand - Try to parse memory addressing mode 2 operand.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+tryParseMemMode2Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a \"[\"");
+
+  if (ParseMemory(Operands, ARMII::AddrMode2))
+    return MatchOperand_NoMatch;
+
+  return MatchOperand_Success;
+}
+
+/// tryParseMemMode3Operand - Try to parse memory addressing mode 3 operand.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+tryParseMemMode3Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a \"[\"");
+
+  if (ParseMemory(Operands, ARMII::AddrMode3))
+    return MatchOperand_NoMatch;
+
+  return MatchOperand_Success;
+}
+
+/// CvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+CvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addMemMode2Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// CvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+CvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addMemMode2Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// CvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+CvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addMemMode3Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
+/// CvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+CvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addMemMode3Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+  return true;
+}
+
 /// Parse an ARM memory expression, return false if successful else return true
 /// or an error. The first token must be a '[' when called.
 ///
 /// TODO Only preindexing and postindexing addressing are started, unindexed
 /// with option, etc are still to do.
 bool ARMAsmParser::
-ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+            ARMII::AddrMode AddrMode = ARMII::AddrModeNone) {
   SMLoc S, E;
   assert(Parser.getTok().is(AsmToken::LBrac) &&
          "Token is not a Left Bracket");
@@ -1083,7 +1390,7 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   ARMOperand *WBOp = 0;
   int OffsetRegNum = -1;
   bool OffsetRegShifted = false;
-  enum ShiftType ShiftType = Lsl;
+  enum ARM_AM::ShiftOpc ShiftType = ARM_AM::lsl;
   const MCExpr *ShiftAmount = 0;
   const MCExpr *Offset = 0;
@@ -1106,10 +1413,17 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {

     const AsmToken &ExclaimTok = Parser.getTok();
     if (ExclaimTok.is(AsmToken::Exclaim)) {
+      // No addrmode3 instruction uses "!".
+      if (AddrMode == ARMII::AddrMode3)
+        return true;
+
       WBOp = ARMOperand::CreateToken(ExclaimTok.getString(),
                                      ExclaimTok.getLoc());
       Writeback = true;
       Parser.Lex(); // Eat exclaim token
+    } else { // In addressing mode 2, pre-indexed mode always ends with "!".
+      if (AddrMode == ARMII::AddrMode2)
+        Preindexed = false;
     }
   } else {
     // The "[Rn" we have so far was not followed by a comma.
@@ -1143,13 +1457,17 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   if (!OffsetIsReg) {
     if (!Offset)
       Offset = MCConstantExpr::Create(0, getContext());
+  } else {
+    if (AddrMode == ARMII::AddrMode3 && OffsetRegShifted) {
+      Error(E, "shift amount not supported");
+      return true;
+    }
   }

-  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset,
-                                           OffsetRegNum, OffsetRegShifted,
-                                           ShiftType, ShiftAmount, Preindexed,
-                                           Postindexed, Negative, Writeback,
-                                           S, E));
+  Operands.push_back(ARMOperand::CreateMem(AddrMode, BaseRegNum, OffsetIsReg,
+                                           Offset, OffsetRegNum, OffsetRegShifted,
+                                           ShiftType, ShiftAmount, Preindexed,
+                                           Postindexed, Negative, Writeback, S, E));
   if (WBOp)
     Operands.push_back(WBOp);
@@ -1165,7 +1483,7 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
 /// we return false on success or an error otherwise.
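ParseMemory now threads the address mode down into the operand, and the addMemMode{2,3}Operands methods above fold the pre/post-indexed flags into the index-mode value that ARM_AM::getAM2Opc/getAM3Opc receive. A minimal standalone sketch of that folding, in plain C++ rather than the LLVM types (illustrative only):

    #include <cassert>

    // Index-mode values as packed by addMemMode2Operands/addMemMode3Operands:
    //   0 = offset form      ldr r0, [r1, #4]
    //   1 = pre-indexed      ldr r0, [r1, #4]!
    //   2 = post-indexed     ldr r0, [r1], #4
    static unsigned encodeIdxMode(bool Preindexed, bool Postindexed) {
      assert(!(Preindexed && Postindexed) && "at most one indexing form");
      return (unsigned)Preindexed | ((unsigned)Postindexed << 1);
    }

    int main() {
      assert(encodeIdxMode(false, false) == 0);
      assert(encodeIdxMode(true, false) == 1);
      assert(encodeIdxMode(false, true) == 2);
    }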
bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, bool &OffsetRegShifted, - enum ShiftType &ShiftType, + enum ARM_AM::ShiftOpc &ShiftType, const MCExpr *&ShiftAmount, const MCExpr *&Offset, bool &OffsetIsReg, @@ -1226,28 +1544,28 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, /// ( lsl | lsr | asr | ror ) , # shift_amount /// rrx /// and returns true if it parses a shift otherwise it returns false. -bool ARMAsmParser::ParseShift(ShiftType &St, const MCExpr *&ShiftAmount, - SMLoc &E) { +bool ARMAsmParser::ParseShift(ARM_AM::ShiftOpc &St, + const MCExpr *&ShiftAmount, SMLoc &E) { const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) return true; StringRef ShiftName = Tok.getString(); if (ShiftName == "lsl" || ShiftName == "LSL") - St = Lsl; + St = ARM_AM::lsl; else if (ShiftName == "lsr" || ShiftName == "LSR") - St = Lsr; + St = ARM_AM::lsr; else if (ShiftName == "asr" || ShiftName == "ASR") - St = Asr; + St = ARM_AM::asr; else if (ShiftName == "ror" || ShiftName == "ROR") - St = Ror; + St = ARM_AM::ror; else if (ShiftName == "rrx" || ShiftName == "RRX") - St = Rrx; + St = ARM_AM::rrx; else return true; Parser.Lex(); // Eat shift type token. // Rrx stands alone. - if (St == Rrx) + if (St == ARM_AM::rrx) return false; // Otherwise, there must be a '#' and a shift amount. @@ -1286,6 +1604,9 @@ bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, case AsmToken::Identifier: if (!TryParseRegisterWithWriteBack(Operands)) return false; + if (!TryParseShiftRegister(Operands)) + return false; + // Fall though for the Identifier case that is not a register or a // special name. diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 78d73d3..bdce2c4 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -18,6 +18,7 @@ #include "ARMDisassembler.h" #include "ARMDisassemblerCore.h" +#include "llvm/ADT/OwningPtr.h" #include "llvm/MC/EDInstInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/Target/TargetRegistry.h" @@ -94,6 +95,9 @@ static unsigned decodeARMInstruction(uint32_t &insn) { // As a result, the decoder fails to deocode USAT properly. if (slice(insn, 27, 21) == 0x37 && slice(insn, 5, 4) == 1) return ARM::USAT; + // As a result, the decoder fails to deocode UQADD16 properly. + if (slice(insn, 27, 20) == 0x66 && slice(insn, 7, 4) == 1) + return ARM::UQADD16; // Ditto for ADDSrs, which is a super-instruction for A8.6.7 & A8.6.8. // As a result, the decoder fails to decode UMULL properly. @@ -280,6 +284,24 @@ static unsigned T2Morph2LoadLiteral(unsigned Opcode) { } } +// Helper function for special case handling of PLD (literal) and friends. +// See A8.6.117 T1 & T2 and friends for why we morphed the opcode +// before returning it. +static unsigned T2Morph2PLDLiteral(unsigned Opcode) { + switch (Opcode) { + default: + return Opcode; // Return unmorphed opcode. + + case ARM::t2PLDi8: case ARM::t2PLDs: + case ARM::t2PLDWi12: case ARM::t2PLDWi8: + case ARM::t2PLDWs: + return ARM::t2PLDi12; + + case ARM::t2PLIi8: case ARM::t2PLIs: + return ARM::t2PLIi12; + } +} + /// decodeThumbSideEffect is a decorator function which can potentially twiddle /// the instruction or morph the returned opcode under Thumb2. 
/// @@ -330,12 +352,27 @@ static unsigned decodeThumbSideEffect(bool IsThumb2, unsigned &insn) { } // --------- Transform End Marker --------- + unsigned unmorphed = decodeThumbInstruction(insn); + // See, for example, A6.3.7 Load word: Table A6-18 Load word. // See A8.6.57 T3, T4 & A8.6.60 T2 and friends for why we morphed the opcode // before returning it to our caller. if (op1 == 3 && slice(op2, 6, 5) == 0 && slice(op2, 0, 0) == 1 - && slice(insn, 19, 16) == 15) - return T2Morph2LoadLiteral(decodeThumbInstruction(insn)); + && slice(insn, 19, 16) == 15) { + unsigned morphed = T2Morph2LoadLiteral(unmorphed); + if (morphed != unmorphed) + return morphed; + } + + // See, for example, A8.6.117 PLD,PLDW (immediate) T1 & T2, and friends for + // why we morphed the opcode before returning it to our caller. + if (slice(insn, 31, 25) == 0x7C && slice(insn, 15, 12) == 0xF + && slice(insn, 22, 22) == 0 && slice(insn, 20, 20) == 1 + && slice(insn, 19, 16) == 15) { + unsigned morphed = T2Morph2PLDLiteral(unmorphed); + if (morphed != unmorphed) + return morphed; + } // One last check for NEON/VFP instructions. if ((op1 == 1 || op1 == 3) && slice(op2, 6, 6) == 1) @@ -375,21 +412,23 @@ bool ARMDisassembler::getInstruction(MCInst &MI, Size = 4; DEBUG({ - errs() << "Opcode=" << Opcode << " Name=" << ARMUtils::OpcodeName(Opcode) + errs() << "\nOpcode=" << Opcode << " Name=" <<ARMUtils::OpcodeName(Opcode) << " Format=" << stringForARMFormat(Format) << '(' << (int)Format << ")\n"; showBitVector(errs(), insn); }); - ARMBasicMCBuilder *Builder = CreateMCBuilder(Opcode, Format); + OwningPtr<ARMBasicMCBuilder> Builder(CreateMCBuilder(Opcode, Format)); if (!Builder) return false; + Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(), + getDisInfoBlock(), getMCContext(), + Address); + if (!Builder->Build(MI, insn)) return false; - delete Builder; - return true; } @@ -398,7 +437,7 @@ bool ThumbDisassembler::getInstruction(MCInst &MI, const MemoryObject &Region, uint64_t Address, raw_ostream &os) const { - // The Thumb instruction stream is a sequence of halhwords. + // The Thumb instruction stream is a sequence of halfwords. // This represents the first halfword as well as the machine instruction // passed to decodeThumbInstruction(). For 16-bit Thumb instruction, the top @@ -463,17 +502,19 @@ bool ThumbDisassembler::getInstruction(MCInst &MI, showBitVector(errs(), insn); }); - ARMBasicMCBuilder *Builder = CreateMCBuilder(Opcode, Format); + OwningPtr<ARMBasicMCBuilder> Builder(CreateMCBuilder(Opcode, Format)); if (!Builder) return false; Builder->SetSession(const_cast<Session *>(&SO)); + Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(), + getDisInfoBlock(), getMCContext(), + Address); + if (!Builder->Build(MI, insn)) return false; - delete Builder; - return true; } diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp index bac68dd..642829c 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp @@ -17,6 +17,7 @@ #include "ARMDisassemblerCore.h" #include "ARMAddressingModes.h" +#include "ARMMCExpr.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -82,10 +83,28 @@ const char *ARMUtils::OpcodeName(unsigned Opcode) { // FIXME: Auto-gened? static unsigned getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister) { - // For this purpose, we can treat rGPR as if it were GPR. 
-  if (RegClassID == ARM::rGPRRegClassID) RegClassID = ARM::GPRRegClassID;
+  if (RegClassID == ARM::rGPRRegClassID) {
+    // Check for the register numbers 13 and 15, which are not permitted for
+    // many Thumb register specifiers.
+    if (RawRegister == 13 || RawRegister == 15) {
+      B->SetErr(-1);
+      return 0;
+    }
+    // For this purpose, we can treat rGPR as if it were GPR.
+    RegClassID = ARM::GPRRegClassID;
+  }
   // See also decodeNEONRd(), decodeNEONRn(), decodeNEONRm().
+  // A7.3 register encoding
+  // Qd -> bit[12] == 0
+  // Qn -> bit[16] == 0
+  // Qm -> bit[0] == 0
+  //
+  // If one of these bits is 1, the instruction is UNDEFINED.
+  if (RegClassID == ARM::QPRRegClassID && slice(RawRegister, 0, 0) == 1) {
+    B->SetErr(-1);
+    return 0;
+  }
   unsigned RegNum =
     RegClassID == ARM::QPRRegClassID ? RawRegister >> 1 : RawRegister;
@@ -497,14 +516,66 @@ static bool DisassemblePseudo(MCInst &MI, unsigned Opcode, uint32_t insn,
   return false;
 }

+// A8.6.94 MLA
+// if d == 15 || n == 15 || m == 15 || a == 15 then UNPREDICTABLE;
+//
+// A8.6.105 MUL
+// if d == 15 || n == 15 || m == 15 then UNPREDICTABLE;
+//
+// A8.6.246 UMULL
+// if dLo == 15 || dHi == 15 || n == 15 || m == 15 then UNPREDICTABLE;
+// if dHi == dLo then UNPREDICTABLE;
+static bool BadRegsMulFrm(unsigned Opcode, uint32_t insn) {
+  unsigned R19_16 = slice(insn, 19, 16);
+  unsigned R15_12 = slice(insn, 15, 12);
+  unsigned R11_8 = slice(insn, 11, 8);
+  unsigned R3_0 = slice(insn, 3, 0);
+  switch (Opcode) {
+  default:
+    // Did we miss an opcode?
+    DEBUG(errs() << "BadRegsMulFrm: unexpected opcode!");
+    return false;
+  case ARM::MLA: case ARM::MLS: case ARM::SMLABB: case ARM::SMLABT:
+  case ARM::SMLATB: case ARM::SMLATT: case ARM::SMLAWB: case ARM::SMLAWT:
+  case ARM::SMMLA: case ARM::SMMLAR: case ARM::SMMLS: case ARM::SMMLSR:
+  case ARM::USADA8:
+    if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15)
+      return true;
+    return false;
+  case ARM::MUL: case ARM::SMMUL: case ARM::SMMULR:
+  case ARM::SMULBB: case ARM::SMULBT: case ARM::SMULTB: case ARM::SMULTT:
+  case ARM::SMULWB: case ARM::SMULWT: case ARM::SMUAD: case ARM::SMUADX:
+  // A8.6.167 SMLAD & A8.6.172 SMLSD
+  case ARM::SMLAD: case ARM::SMLADX: case ARM::SMLSD: case ARM::SMLSDX:
+  case ARM::USAD8:
+    if (R19_16 == 15 || R11_8 == 15 || R3_0 == 15)
+      return true;
+    return false;
+  case ARM::SMLAL: case ARM::SMULL: case ARM::UMAAL: case ARM::UMLAL:
+  case ARM::UMULL:
+  case ARM::SMLALBB: case ARM::SMLALBT: case ARM::SMLALTB: case ARM::SMLALTT:
+  case ARM::SMLALD: case ARM::SMLALDX: case ARM::SMLSLD: case ARM::SMLSLDX:
+    if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15)
+      return true;
+    if (R19_16 == R15_12)
+      return true;
+    return false;
+  }
+}

 // Multiply Instructions.
-// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLS:
+// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLAR,
+// SMMLS, SMMLSR, SMLAD, SMLADX, SMLSD, SMLSDX, and USADA8 (for convenience):
 // Rd{19-16} Rn{3-0} Rm{11-8} Ra{15-12}
+// But note that register checking for {SMLAD, SMLADX, SMLSD, SMLSDX} is
+// only for {d, n, m}.
// -// MUL, SMMUL, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT: +// MUL, SMMUL, SMMULR, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT, SMUAD, +// SMUADX, and USAD8 (for convenience): // Rd{19-16} Rn{3-0} Rm{11-8} // -// SMLAL, SMULL, UMAAL, UMLAL, UMULL, SMLALBB, SMLALBT, SMLALTB, SMLALTT: +// SMLAL, SMULL, UMAAL, UMLAL, UMULL, SMLALBB, SMLALBT, SMLALTB, SMLALTT, +// SMLALD, SMLADLX, SMLSLD, SMLSLDX: // RdLo{15-12} RdHi{19-16} Rn{3-0} Rm{11-8} // // The mapping of the multiply registers to the "regular" ARM registers, where @@ -531,6 +602,10 @@ static bool DisassembleMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, && OpInfo[2].RegClass == ARM::GPRRegClassID && "Expect three register operands"); + // Sanity check for the register encodings. + if (BadRegsMulFrm(Opcode, insn)) + return false; + // Instructions with two destination registers have RdLo{15-12} first. if (NumDefs == 2) { assert(NumOps >= 4 && OpInfo[3].RegClass == ARM::GPRRegClassID && @@ -618,18 +693,38 @@ static inline unsigned GetCopOpc(uint32_t insn) { static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - assert(NumOps >= 5 && "Num of operands >= 5 for coprocessor instr"); + assert(NumOps >= 4 && "Num of operands >= 4 for coprocessor instr"); unsigned &OpIdx = NumOpsAdded; + // A8.6.92 + // if coproc == '101x' then SEE "Advanced SIMD and VFP" + // But since the special instructions have more explicit encoding bits + // specified, if coproc == 10 or 11, we should reject it as invalid. + unsigned coproc = GetCoprocessor(insn); + if ((Opcode == ARM::MCR || Opcode == ARM::MCRR || + Opcode == ARM::MRC || Opcode == ARM::MRRC) && + (coproc == 10 || coproc == 11)) { + DEBUG(errs() << "Encoding error: coproc == 10 or 11 for MCR[R]/MR[R]C\n"); + return false; + } + bool OneCopOpc = (Opcode == ARM::MCRR || Opcode == ARM::MCRR2 || Opcode == ARM::MRRC || Opcode == ARM::MRRC2); + // CDP/CDP2 has no GPR operand; the opc1 operand is also wider (Inst{23-20}). bool NoGPR = (Opcode == ARM::CDP || Opcode == ARM::CDP2); bool LdStCop = LdStCopOpcode(Opcode); + bool RtOut = (Opcode == ARM::MRC || Opcode == ARM::MRC2); OpIdx = 0; - MI.addOperand(MCOperand::CreateImm(GetCoprocessor(insn))); + if (RtOut) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + } + MI.addOperand(MCOperand::CreateImm(coproc)); + ++OpIdx; if (LdStCop) { // Unindex if P:W = 0b00 --> _OPTION variant @@ -639,26 +734,34 @@ static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)))); + OpIdx += 2; if (PW) { MI.addOperand(MCOperand::CreateReg(0)); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + const TargetInstrDesc &TID = ARMInsts[Opcode]; + unsigned IndexMode = + (TID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, slice(insn, 7, 0) << 2, - ARM_AM::no_shift); + ARM_AM::no_shift, IndexMode); MI.addOperand(MCOperand::CreateImm(Offset)); - OpIdx = 5; + OpIdx += 2; } else { MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 0))); - OpIdx = 4; + ++OpIdx; } } else { MI.addOperand(MCOperand::CreateImm(OneCopOpc ? GetCopOpc(insn) : GetCopOpc1(insn, NoGPR))); + ++OpIdx; - MI.addOperand(NoGPR ? 
MCOperand::CreateImm(decodeRd(insn)) - : MCOperand::CreateReg( - getRegisterEnum(B, ARM::GPRRegClassID, - decodeRd(insn)))); + if (!RtOut) { + MI.addOperand(NoGPR ? MCOperand::CreateImm(decodeRd(insn)) + : MCOperand::CreateReg( + getRegisterEnum(B, ARM::GPRRegClassID, + decodeRd(insn)))); + ++OpIdx; + } MI.addOperand(OneCopOpc ? MCOperand::CreateReg( getRegisterEnum(B, ARM::GPRRegClassID, @@ -667,7 +770,7 @@ static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateImm(decodeRm(insn))); - OpIdx = 5; + OpIdx += 2; if (!OneCopOpc) { MI.addOperand(MCOperand::CreateImm(GetCopOpc2(insn))); @@ -679,8 +782,8 @@ static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn, } // Branch Instructions. -// BLr9: SignExtend(Imm24:'00', 32) -// Bcc, BLr9_pred: SignExtend(Imm24:'00', 32) Pred0 Pred1 +// BL: SignExtend(Imm24:'00', 32) +// Bcc, BL_pred: SignExtend(Imm24:'00', 32) Pred0 Pred1 // SMC: ZeroExtend(imm4, 32) // SVC: ZeroExtend(Imm24, 32) // @@ -735,6 +838,11 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // MSRi take a mask, followed by one so_imm operand. The mask contains the // R Bit in bit 4, and the special register fields in bits 3-0. if (Opcode == ARM::MSRi) { + // A5.2.11 MSR (immediate), and hints & B6.1.6 MSR (immediate) + // The hints instructions have more specific encodings, so if mask == 0, + // we should reject this as an invalid instruction. + if (slice(insn, 19, 16) == 0) + return false; MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ | slice(insn, 19, 16) /* Special Reg */ )); // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0. @@ -760,11 +868,11 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } - assert((Opcode == ARM::Bcc || Opcode == ARM::BLr9 || Opcode == ARM::BLr9_pred + assert((Opcode == ARM::Bcc || Opcode == ARM::BL || Opcode == ARM::BL_pred || Opcode == ARM::SMC || Opcode == ARM::SVC) && "Unexpected Opcode"); - assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Reg operand expected"); + assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Imm operand expected"); int Imm32 = 0; if (Opcode == ARM::SMC) { @@ -778,12 +886,6 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned Imm26 = slice(insn, 23, 0) << 2; //Imm32 = signextend<signed int, 26>(Imm26); Imm32 = SignExtend32<26>(Imm26); - - // When executing an ARM instruction, PC reads as the address of the current - // instruction plus 8. The assembler subtracts 8 from the difference - // between the branch instruction and the target address, disassembler has - // to add 8 to compensate. - Imm32 += 8; } MI.addOperand(MCOperand::CreateImm(Imm32)); @@ -793,7 +895,7 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, } // Misc. Branch Instructions. -// BLXr9, BXr9 +// BLX, BLXi, BX // BX, BX_RET static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -809,8 +911,9 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR) return true; - // BLXr9 and BX take one GPR reg. - if (Opcode == ARM::BLXr9 || Opcode == ARM::BX) { + // BLX and BX take one GPR reg. 
+ if (Opcode == ARM::BLX || Opcode == ARM::BLX_pred || + Opcode == ARM::BX) { assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && "Reg operand expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -819,6 +922,17 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } + // BLXi takes imm32 (the PC offset). + if (Opcode == ARM::BLXi) { + assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Imm operand expected"); + // SignExtend(imm24:H:'0', 32) where imm24 = Inst{23-0} and H = Inst{24}. + unsigned Imm26 = slice(insn, 23, 0) << 2 | slice(insn, 24, 24) << 1; + int Imm32 = SignExtend32<26>(Imm26); + MI.addOperand(MCOperand::CreateImm(Imm32)); + OpIdx = 1; + return true; + } + return false; } @@ -837,6 +951,24 @@ static inline bool getBFCInvMask(uint32_t insn, uint32_t &mask) { return true; } +// Standard data-processing instructions allow PC as a register specifier, +// but we should reject other DPFrm instructions with PC as registers. +static bool BadRegsDPFrm(unsigned Opcode, uint32_t insn) { + switch (Opcode) { + default: + // Did we miss an opcode? + if (decodeRd(insn) == 15 || decodeRn(insn) == 15 || decodeRm(insn) == 15) { + DEBUG(errs() << "DPFrm with bad reg specifier(s)\n"); + return true; + } + case ARM::ADCrr: case ARM::ADDSrr: case ARM::ADDrr: case ARM::ANDrr: + case ARM::BICrr: case ARM::CMNzrr: case ARM::CMPrr: case ARM::EORrr: + case ARM::ORRrr: case ARM::RSBrr: case ARM::RSCrr: case ARM::SBCrr: + case ARM::SUBSrr: case ARM::SUBrr: case ARM::TEQrr: case ARM::TSTrr: + return false; + } +} + // A major complication is the fact that some of the saturating add/subtract // operations have Rd Rm Rn, instead of the "normal" Rd Rn Rm. // They are QADD, QDADD, QDSUB, and QSUB. @@ -864,6 +996,10 @@ static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Special-case handling of BFC/BFI/SBFX/UBFX. if (Opcode == ARM::BFC || Opcode == ARM::BFI) { + // A8.6.17 BFC & A8.6.18 BFI + // Sanity check Rd. + if (decodeRd(insn) == 15) + return false; MI.addOperand(MCOperand::CreateReg(0)); if (Opcode == ARM::BFI) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -879,6 +1015,9 @@ static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } if (Opcode == ARM::SBFX || Opcode == ARM::UBFX) { + // Sanity check Rd and Rm. + if (decodeRd(insn) == 15 || decodeRm(insn) == 15) + return false; MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); MI.addOperand(MCOperand::CreateImm(slice(insn, 11, 7))); @@ -915,15 +1054,21 @@ static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Assert disabled because saturating operations, e.g., A8.6.127 QASX, are // routed here as well. // assert(getIBit(insn) == 0 && "I_Bit != '0' reg/reg form"); + if (BadRegsDPFrm(Opcode, insn)) + return false; MI.addOperand(MCOperand::CreateReg( getRegisterEnum(B, ARM::GPRRegClassID, RmRn? decodeRn(insn) : decodeRm(insn)))); ++OpIdx; } else if (Opcode == ARM::MOVi16 || Opcode == ARM::MOVTi16) { + // These two instructions don't allow d as 15. + if (decodeRd(insn) == 15) + return false; // We have an imm16 = imm4:imm12 (imm4=Inst{19:16}, imm12 = Inst{11:0}). 
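As a worked example of the imm4:imm12 fold performed just below, with hypothetical field values (a standalone C++ sketch, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical movw fields: imm4 = Inst{19-16}, imm12 = Inst{11-0}.
      uint32_t imm4 = 0x1, imm12 = 0x234;
      uint32_t Imm16 = (imm4 << 12) | imm12; // same fold as the code below
      assert(Imm16 == 0x1234);               // i.e. movw Rd, #0x1234
    }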
assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form"); unsigned Imm16 = slice(insn, 19, 16) << 12 | slice(insn, 11, 0); - MI.addOperand(MCOperand::CreateImm(Imm16)); + if (!B->tryAddingSymbolicOperand(Imm16, 4, MI)) + MI.addOperand(MCOperand::CreateImm(Imm16)); ++OpIdx; } else { // We have a reg/imm form. @@ -992,6 +1137,21 @@ static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); if (Rs) { + // If Inst{7} != 0, we should reject this insn as an invalid encoding. + if (slice(insn, 7, 7)) + return false; + + // A8.6.3 ADC (register-shifted register) + // if d == 15 || n == 15 || m == 15 || s == 15 then UNPREDICTABLE; + // + // This also accounts for shift instructions (register) where, fortunately, + // Inst{19-16} = 0b0000. + // A8.6.89 LSL (register) + // if d == 15 || n == 15 || m == 15 then UNPREDICTABLE; + if (decodeRd(insn) == 15 || decodeRn(insn) == 15 || + decodeRm(insn) == 15 || decodeRs(insn) == 15) + return false; + // Register-controlled shifts: [Rm, Rs, shift]. MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRs(insn)))); @@ -1015,6 +1175,71 @@ static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } +static bool BadRegsLdStFrm(unsigned Opcode, uint32_t insn, bool Store, bool WBack, + bool Imm) { + const StringRef Name = ARMInsts[Opcode].Name; + unsigned Rt = decodeRd(insn); + unsigned Rn = decodeRn(insn); + unsigned Rm = decodeRm(insn); + unsigned P = getPBit(insn); + unsigned W = getWBit(insn); + + if (Store) { + // Only STR (immediate, register) allows PC as the source. + if (Name.startswith("STRB") && Rt == 15) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + if (WBack && (Rn == 15 || Rn == Rt)) { + DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n"); + return true; + } + if (!Imm && Rm == 15) { + DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n"); + return true; + } + } else { + // Only LDR (immediate, register) allows PC as the destination. + if (Name.startswith("LDRB") && Rt == 15) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + if (Imm) { + // Immediate + if (Rn == 15) { + // The literal form must be in offset mode; it's an encoding error + // otherwise. + if (!(P == 1 && W == 0)) { + DEBUG(errs() << "Ld literal form with !(P == 1 && W == 0)\n"); + return true; + } + // LDRB (literal) does not allow PC as the destination. + if (Opcode != ARM::LDRi12 && Rt == 15) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + } else { + // Write back while Rn == Rt does not make sense. 
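The writeback screens in this helper reduce to one small predicate; a standalone restatement (the function name is illustrative, the logic mirrors BadRegsLdStFrm):

    #include <cassert>

    // UNPREDICTABLE when writing the address back into the PC, or into the
    // same register that is being transferred.
    static bool badWriteback(bool WBack, unsigned Rn, unsigned Rt) {
      return WBack && (Rn == 15 || Rn == Rt);
    }

    int main() {
      assert(badWriteback(true, 1, 1));   // e.g. ldr r1, [r1], #4
      assert(badWriteback(true, 15, 0));  // writeback into the PC
      assert(!badWriteback(false, 1, 1)); // no writeback, no conflict
    }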
+ if (WBack && (Rn == Rt)) { + DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n"); + return true; + } + } + } else { + // Register + if (Rm == 15) { + DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n"); + return true; + } + if (WBack && (Rn == 15 || Rn == Rt)) { + DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n"); + return true; + } + } + } + return false; +} + static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) { @@ -1077,19 +1302,41 @@ static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, if (OpIdx + 1 >= NumOps) return false; - assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+1].RegClass < 0) && - "Expect 1 reg operand followed by 1 imm operand"); + if (BadRegsLdStFrm(Opcode, insn, isStore, isPrePost, getIBit(insn)==0)) + return false; ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + unsigned IndexMode = + (TID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; if (getIBit(insn) == 0) { - MI.addOperand(MCOperand::CreateReg(0)); + // For pre- and post-indexed case, add a reg0 operand (Addressing Mode #2). + // Otherwise, skip the reg operand since for addrmode_imm12, Rn has already + // been populated. + if (isPrePost) { + MI.addOperand(MCOperand::CreateReg(0)); + OpIdx += 1; + } - // Disassemble the 12-bit immediate offset. unsigned Imm12 = slice(insn, 11, 0); - unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, Imm12, ARM_AM::no_shift); - MI.addOperand(MCOperand::CreateImm(Offset)); + if (Opcode == ARM::LDRBi12 || Opcode == ARM::LDRi12 || + Opcode == ARM::STRBi12 || Opcode == ARM::STRi12) { + // Disassemble the 12-bit immediate offset, which is the second operand in + // $addrmode_imm12 => (ops GPR:$base, i32imm:$offsimm). + int Offset = AddrOpcode == ARM_AM::add ? 1 * Imm12 : -1 * Imm12; + MI.addOperand(MCOperand::CreateImm(Offset)); + } else { + // Disassemble the 12-bit immediate offset, which is the second operand in + // $am2offset => (ops GPR, i32imm). + unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, Imm12, ARM_AM::no_shift, + IndexMode); + MI.addOperand(MCOperand::CreateImm(Offset)); + } + OpIdx += 1; } else { + // If Inst{25} = 1 and Inst{4} != 0, we should reject this as invalid. + if (slice(insn,4,4) == 1) + return false; + // Disassemble the offset reg (Rm), shift type, and immediate shift length. MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); @@ -1101,9 +1348,9 @@ static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // A8.4.1. Possible rrx or shift amount of 32... getImmShiftSE(ShOp, ShImm); MI.addOperand(MCOperand::CreateImm( - ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp))); + ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp, IndexMode))); + OpIdx += 2; } - OpIdx += 2; return true; } @@ -1125,7 +1372,7 @@ static bool HasDualReg(unsigned Opcode) { case ARM::LDRD: case ARM::LDRD_PRE: case ARM::LDRD_POST: case ARM::STRD: case ARM::STRD_PRE: case ARM::STRD_POST: return true; - } + } } static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, @@ -1153,8 +1400,6 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, ++OpIdx; } - bool DualReg = HasDualReg(Opcode); - // Disassemble the dst/src operand. 
if (OpIdx >= NumOps) return false; @@ -1165,8 +1410,8 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, decodeRd(insn)))); ++OpIdx; - // Fill in LDRD and STRD's second operand. - if (DualReg) { + // Fill in LDRD and STRD's second operand Rt operand. + if (HasDualReg(Opcode)) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRd(insn) + 1))); ++OpIdx; @@ -1188,7 +1433,7 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && "Reg operand expected"); assert((!isPrePost || (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)) - && "Index mode or tied_to operand expected"); + && "Offset mode or tied_to operand expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)))); ++OpIdx; @@ -1204,19 +1449,22 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, "Expect 1 reg operand followed by 1 imm operand"); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; + unsigned IndexMode = + (TID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; if (getAM3IBit(insn) == 1) { MI.addOperand(MCOperand::CreateReg(0)); // Disassemble the 8-bit immediate offset. unsigned Imm4H = (insn >> ARMII::ImmHiShift) & 0xF; unsigned Imm4L = insn & 0xF; - unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, (Imm4H << 4) | Imm4L); + unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, (Imm4H << 4) | Imm4L, + IndexMode); MI.addOperand(MCOperand::CreateImm(Offset)); } else { // Disassemble the offset reg (Rm). MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); - unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, 0); + unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, 0, IndexMode); MI.addOperand(MCOperand::CreateImm(Offset)); } OpIdx += 2; @@ -1236,13 +1484,13 @@ static bool DisassembleStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, } // The algorithm for disassembly of LdStMulFrm is different from others because -// it explicitly populates the two predicate operands after operand 0 (the base) -// and operand 1 (the AM4 mode imm). After operand 3, we need to populate the -// reglist with each affected register encoded as an MCOperand. +// it explicitly populates the two predicate operands after the base register. +// After that, we need to populate the reglist with each affected register +// encoded as an MCOperand. static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - assert(NumOps >= 5 && "LdStMulFrm expects NumOps >= 5"); + assert(NumOps >= 4 && "LdStMulFrm expects NumOps >= 4"); NumOpsAdded = 0; unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)); @@ -1260,8 +1508,10 @@ static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(Base)); // Handling the two predicate operands before the reglist. - int64_t CondVal = insn >> ARMII::CondShift; - MI.addOperand(MCOperand::CreateImm(CondVal == 0xF ? 
0xE : CondVal)); + int64_t CondVal = getCondField(insn); + if (CondVal == 0xF) + return false; + MI.addOperand(MCOperand::CreateImm(CondVal)); MI.addOperand(MCOperand::CreateReg(ARM::CPSR)); NumOpsAdded += 3; @@ -1352,6 +1602,12 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID; + // Sanity check the registers, which should not be 15. + if (decodeRd(insn) == 15 || decodeRm(insn) == 15) + return false; + if (ThreeReg && decodeRn(insn) == 15) + return false; + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRd(insn)))); ++OpIdx; @@ -1376,7 +1632,7 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, ARM_AM::ShiftOpc Opc = ARM_AM::no_shift; if (Opcode == ARM::PKHBT) Opc = ARM_AM::lsl; - else if (Opcode == ARM::PKHBT) + else if (Opcode == ARM::PKHTB) Opc = ARM_AM::asr; getImmShiftSE(Opc, ShiftAmt); MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShiftAmt))); @@ -1391,6 +1647,11 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleSatFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + // A8.6.183 SSAT + // if d == 15 || n == 15 then UNPREDICTABLE; + if (decodeRd(insn) == 15 || decodeRm(insn) == 15) + return false; + const TargetInstrDesc &TID = ARMInsts[Opcode]; NumOpsAdded = TID.getNumOperands() - 2; // ignore predicate operands @@ -1429,6 +1690,11 @@ static bool DisassembleSatFrm(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleExtFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { + // A8.6.220 SXTAB + // if d == 15 || m == 15 then UNPREDICTABLE; + if (decodeRd(insn) == 15 || decodeRm(insn) == 15) + return false; + const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; unsigned &OpIdx = NumOpsAdded; @@ -1611,7 +1877,7 @@ static bool DisassembleVFPBinaryFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // A8.6.295 vcvt (floating-point <-> integer) // Int to FP: VSITOD, VSITOS, VUITOD, VUITOS // FP to Int: VTOSI[Z|R]D, VTOSI[Z|R]S, VTOUI[Z|R]D, VTOUI[Z|R]S -// +// // A8.6.297 vcvt (floating-point and fixed-point) // Dd|Sd Dd|Sd(TIED_TO) #fbits(= 16|32 - UInt(imm4:i)) static bool DisassembleVFPConv1Frm(MCInst &MI, unsigned Opcode, uint32_t insn, @@ -1800,15 +2066,14 @@ static bool DisassembleVFPLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, } // VFP Load/Store Multiple Instructions. -// This is similar to the algorithm for LDM/STM in that operand 0 (the base) and -// operand 1 (the AM4 mode imm) is followed by two predicate operands. It is -// followed by a reglist of either DPR(s) or SPR(s). +// We have an optional write back reg, the base, and two predicate operands. +// It is then followed by a reglist of either DPR(s) or SPR(s). // // VLDMD[_UPD], VLDMS[_UPD], VSTMD[_UPD], VSTMS[_UPD] static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - assert(NumOps >= 5 && "VFPLdStMulFrm expects NumOps >= 5"); + assert(NumOps >= 4 && "VFPLdStMulFrm expects NumOps >= 4"); unsigned &OpIdx = NumOpsAdded; @@ -1827,25 +2092,18 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(Base)); - // Next comes the AM4 Opcode. 
- ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn)); - // Must be either "ia" or "db" submode. - if (SubMode != ARM_AM::ia && SubMode != ARM_AM::db) { - DEBUG(errs() << "Illegal addressing mode 4 sub-mode!\n"); - return false; - } - MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode))); - // Handling the two predicate operands before the reglist. - int64_t CondVal = insn >> ARMII::CondShift; - MI.addOperand(MCOperand::CreateImm(CondVal == 0xF ? 0xE : CondVal)); + int64_t CondVal = getCondField(insn); + if (CondVal == 0xF) + return false; + MI.addOperand(MCOperand::CreateImm(CondVal)); MI.addOperand(MCOperand::CreateReg(ARM::CPSR)); - OpIdx += 4; + OpIdx += 3; - bool isSPVFP = (Opcode == ARM::VLDMSIA || Opcode == ARM::VLDMSDB || + bool isSPVFP = (Opcode == ARM::VLDMSIA || Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMSDB_UPD || - Opcode == ARM::VSTMSIA || Opcode == ARM::VSTMSDB || + Opcode == ARM::VSTMSIA || Opcode == ARM::VSTMSIA_UPD || Opcode == ARM::VSTMSDB_UPD); unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID; @@ -1855,6 +2113,11 @@ static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Fill the variadic part of reglist. unsigned char Imm8 = insn & 0xFF; unsigned Regs = isSPVFP ? Imm8 : Imm8/2; + + // Apply some sanity checks before proceeding. + if (Regs == 0 || (RegD + Regs) > 32 || (!isSPVFP && Regs > 16)) + return false; + for (unsigned i = 0; i < Regs; ++i) { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, RegD + i))); @@ -2136,7 +2399,7 @@ static unsigned decodeN3VImm(uint32_t insn) { // Correctly set VLD*/VST*'s TIED_TO GPR, as the asm printer needs it. static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, bool Store, bool DblSpaced, - BO B) { + unsigned alignment, BO B) { const TargetInstrDesc &TID = ARMInsts[Opcode]; const TargetOperandInfo *OpInfo = TID.OpInfo; @@ -2180,9 +2443,10 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); + // addrmode6 := (ops GPR:$addr, i32imm) MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); - MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? + MI.addOperand(MCOperand::CreateImm(alignment)); // Alignment OpIdx += 2; if (WB) { @@ -2230,9 +2494,10 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); + // addrmode6 := (ops GPR:$addr, i32imm) MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); - MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? + MI.addOperand(MCOperand::CreateImm(alignment)); // Alignment OpIdx += 2; if (WB) { @@ -2263,6 +2528,92 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } +// A8.6.308, A8.6.311, A8.6.314, A8.6.317. 
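Align4OneLaneInst below walks all four element counts; as a taste of the tables it encodes, here is a condensed standalone sketch of just the elem == 1 (VLD1/VST1 single-lane, A8.6.308) rules, reporting alignment in bits (illustrative only):

    #include <cassert>

    // Returns false for index_align encodings the ARM ARM marks UNDEFINED;
    // alignment == 0 means standard (unaligned) access.
    static bool vld1LaneAlign(unsigned size, unsigned index_align,
                              unsigned &alignment) {
      alignment = 0;
      switch (size) {
      case 0:                                  // 8-bit element
        return (index_align & 1) == 0;         // index_align<0> must be 0
      case 1: {                                // 16-bit element
        unsigned bits = index_align & 3;
        if (bits != 0 && bits != 1) return false;
        if (bits == 1) alignment = 16;
        return true;
      }
      case 2: {                                // 32-bit element
        unsigned bits = index_align & 7;
        if (bits != 0 && bits != 3) return false;
        if (bits == 3) alignment = 32;
        return true;
      }
      default: return false;
      }
    }

    int main() {
      unsigned a;
      assert(vld1LaneAlign(1, 1, a) && a == 16); // vld1.16 {d0[0]}, [r0:16]
      assert(!vld1LaneAlign(2, 1, a));           // UNDEFINED index_align
    }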
+static bool Align4OneLaneInst(unsigned elem, unsigned size,
+                              unsigned index_align, unsigned &alignment) {
+  unsigned bits = 0;
+  switch (elem) {
+  default:
+    return false;
+  case 1:
+    // A8.6.308
+    if (size == 0)
+      return slice(index_align, 0, 0) == 0;
+    else if (size == 1) {
+      bits = slice(index_align, 1, 0);
+      if (bits != 0 && bits != 1)
+        return false;
+      if (bits == 1)
+        alignment = 16;
+      return true;
+    } else if (size == 2) {
+      bits = slice(index_align, 2, 0);
+      if (bits != 0 && bits != 3)
+        return false;
+      if (bits == 3)
+        alignment = 32;
+      return true;
+    }
+    return true;
+  case 2:
+    // A8.6.311
+    if (size == 0) {
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 16;
+      return true;
+    } else if (size == 1) {
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 32;
+      return true;
+    } else if (size == 2) {
+      if (slice(index_align, 1, 1) != 0)
+        return false;
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 64;
+      return true;
+    }
+    return true;
+  case 3:
+    // A8.6.314
+    if (size == 0) {
+      if (slice(index_align, 0, 0) != 0)
+        return false;
+      return true;
+    } else if (size == 1) {
+      if (slice(index_align, 0, 0) != 0)
+        return false;
+      return true;
+    } else if (size == 2) {
+      if (slice(index_align, 1, 0) != 0)
+        return false;
+      return true;
+    }
+    return true;
+  case 4:
+    // A8.6.317
+    if (size == 0) {
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 32;
+      return true;
+    } else if (size == 1) {
+      if (slice(index_align, 0, 0) == 1)
+        alignment = 64;
+      return true;
+    } else if (size == 2) {
+      bits = slice(index_align, 1, 0);
+      if (bits == 3)
+        return false;
+      if (bits == 1)
+        alignment = 64;
+      else if (bits == 2)
+        alignment = 128;
+      return true;
+    }
+    return true;
+  }
+}
+
 // A7.7
 // If L (Inst{21}) == 0, store instructions.
 // Find out about double-spaced-ness of the Opcode and pass it on to
@@ -2272,11 +2623,33 @@ static bool DisassembleNLdSt(MCInst &MI, unsigned Opcode, uint32_t insn,
   const StringRef Name = ARMInsts[Opcode].Name;
   bool DblSpaced = false;
+  // 0 represents standard alignment, i.e., unaligned data access.
+  unsigned alignment = 0;
+
+  unsigned elem = 0; // legal values: {1, 2, 3, 4}
+  if (Name.startswith("VST1") || Name.startswith("VLD1"))
+    elem = 1;
+
+  if (Name.startswith("VST2") || Name.startswith("VLD2"))
+    elem = 2;
+
+  if (Name.startswith("VST3") || Name.startswith("VLD3"))
+    elem = 3;
+
+  if (Name.startswith("VST4") || Name.startswith("VLD4"))
+    elem = 4;

   if (Name.find("LN") != std::string::npos) {
     // To one lane instructions.
     // See, for example, 8.6.317 VLD4 (single 4-element structure to one lane).

+    // Utility function takes number of elements, size, and index_align.
+    if (!Align4OneLaneInst(elem,
+                           slice(insn, 11, 10),
+                           slice(insn, 7, 4),
+                           alignment))
+      return false;
+
     // <size> == 16 && Inst{5} == 1 --> DblSpaced = true
     if (Name.endswith("16") || Name.endswith("16_UPD"))
       DblSpaced = slice(insn, 5, 5) == 1;
@@ -2284,30 +2657,102 @@ static bool DisassembleNLdSt(MCInst &MI, unsigned Opcode, uint32_t insn,
     // <size> == 32 && Inst{6} == 1 --> DblSpaced = true
     if (Name.endswith("32") || Name.endswith("32_UPD"))
       DblSpaced = slice(insn, 6, 6) == 1;
-
+  } else if (Name.find("DUP") != std::string::npos) {
+    // Single element (or structure) to all lanes.
+    // Inst{9-8} encodes the number of element(s) in the structure, with:
+    // 0b00 (VLD1DUP) (for this, the a bit makes sense only for data size 16
+    //                 and 32)
+ // 0b01 (VLD2DUP) + // 0b10 (VLD3DUP) (for this, a bit must be encoded as 0) + // 0b11 (VLD4DUP) + // + // Inst{7-6} encodes the data size, with: + // 0b00 => 8, 0b01 => 16, 0b10 => 32 + // + // Inst{4} (the a bit) encodes the align action (0: standard alignment) + unsigned elem = slice(insn, 9, 8) + 1; + unsigned a = slice(insn, 4, 4); + if (elem != 3) { + // 0b11 is not a valid encoding for Inst{7-6}. + if (slice(insn, 7, 6) == 3) + return false; + unsigned data_size = 8 << slice(insn, 7, 6); + // For VLD1DUP, a bit makes sense only for data size of 16 and 32. + if (a && data_size == 8) + return false; + + // Now we can calculate the alignment! + if (a) + alignment = elem * data_size; + } else { + if (a) { + // A8.6.315 VLD3 (single 3-element structure to all lanes) + // The a bit must be encoded as 0. + return false; + } + } } else { // Multiple n-element structures with type encoded as Inst{11-8}. // See, for example, A8.6.316 VLD4 (multiple 4-element structures). - // n == 2 && type == 0b1001 -> DblSpaced = true - if (Name.startswith("VST2") || Name.startswith("VLD2")) - DblSpaced = slice(insn, 11, 8) == 9; - - // n == 3 && type == 0b0101 -> DblSpaced = true - if (Name.startswith("VST3") || Name.startswith("VLD3")) - DblSpaced = slice(insn, 11, 8) == 5; - - // n == 4 && type == 0b0001 -> DblSpaced = true - if (Name.startswith("VST4") || Name.startswith("VLD4")) - DblSpaced = slice(insn, 11, 8) == 1; - + // Inst{5-4} encodes alignment. + unsigned align = slice(insn, 5, 4); + switch (align) { + default: + break; + case 1: + alignment = 64; break; + case 2: + alignment = 128; break; + case 3: + alignment = 256; break; + } + + unsigned type = slice(insn, 11, 8); + // Reject UNDEFINED instructions based on type and align. + // Plus set DblSpaced flag where appropriate. 
+ switch (elem) { + default: + break; + case 1: + // n == 1 + // A8.6.307 & A8.6.391 + if ((type == 7 && slice(align, 1, 1) == 1) || + (type == 10 && align == 3) || + (type == 6 && slice(align, 1, 1) == 1)) + return false; + break; + case 2: + // n == 2 && type == 0b1001 -> DblSpaced = true + // A8.6.310 & A8.6.393 + if ((type == 8 || type == 9) && align == 3) + return false; + DblSpaced = (type == 9); + break; + case 3: + // n == 3 && type == 0b0101 -> DblSpaced = true + // A8.6.313 & A8.6.395 + if (slice(insn, 7, 6) == 3 || slice(align, 1, 1) == 1) + return false; + DblSpaced = (type == 5); + break; + case 4: + // n == 4 && type == 0b0001 -> DblSpaced = true + // A8.6.316 & A8.6.397 + if (slice(insn, 7, 6) == 3) + return false; + DblSpaced = (type == 1); + break; + } } return DisassembleNLdSt0(MI, Opcode, insn, NumOps, NumOpsAdded, - slice(insn, 21, 21) == 0, DblSpaced, B); + slice(insn, 21, 21) == 0, DblSpaced, alignment/8, B); } // VMOV (immediate) // Qd/Dd imm +// VBIC (immediate) +// VORR (immediate) +// Qd/Dd imm src(=Qd/Dd) static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -2334,12 +2779,20 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, case ARM::VMOVv8i16: case ARM::VMVNv4i16: case ARM::VMVNv8i16: + case ARM::VBICiv4i16: + case ARM::VBICiv8i16: + case ARM::VORRiv4i16: + case ARM::VORRiv8i16: esize = ESize16; break; case ARM::VMOVv2i32: case ARM::VMOVv4i32: case ARM::VMVNv2i32: case ARM::VMVNv4i32: + case ARM::VBICiv2i32: + case ARM::VBICiv4i32: + case ARM::VORRiv2i32: + case ARM::VORRiv4i32: esize = ESize32; break; case ARM::VMOVv1i64: @@ -2347,7 +2800,7 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, esize = ESize64; break; default: - assert(0 && "Unreachable code!"); + assert(0 && "Unexpected opcode!"); return false; } @@ -2356,6 +2809,16 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, MI.addOperand(MCOperand::CreateImm(decodeN1VImm(insn, esize))); NumOpsAdded = 2; + + // VBIC/VORRiv*i* variants have an extra $src = $Vd to be filled in. + if (NumOps >= 3 && + (OpInfo[2].RegClass == ARM::DPRRegClassID || + OpInfo[2].RegClass == ARM::QPRRegClassID)) { + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[0].RegClass, + decodeNEONRd(insn)))); + NumOpsAdded += 1; + } + return true; } @@ -2376,7 +2839,7 @@ enum N2VFlag { // // Vector Move Long: // Qd Dm -// +// // Vector Move Narrow: // Dd Qm // @@ -2518,7 +2981,7 @@ static bool DisassembleNVectorShift(MCInst &MI, unsigned Opcode, uint32_t insn, assert(OpInfo[OpIdx].RegClass < 0 && "Imm operand expected"); // Add the imm operand. - + // VSHLL has maximum shift count as the imm, inferred from its size. unsigned Imm; switch (Opcode) { @@ -2631,7 +3094,7 @@ static bool DisassembleNVdVnVmOptImm(MCInst &MI, unsigned Opcode, uint32_t insn, // N3RegFrm. if (Opcode == ARM::VMOVDneon || Opcode == ARM::VMOVQ) return true; - + // Dm = Inst{5:3-0} => NEON Rm // or // Dm is restricted to D0-D7 if size is 16, D0-D15 otherwise @@ -2770,7 +3233,7 @@ static bool DisassembleNGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, ElemSize esize = Opcode == ARM::VGETLNi32 ? ESize32 : ((Opcode == ARM::VGETLNs16 || Opcode == ARM::VGETLNu16) ? 
ESize16 - : ESize32); + : ESize8); // Rt = Inst{15-12} => ARM Rd MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -2852,17 +3315,6 @@ static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } -// A8.6.41 DMB -// A8.6.42 DSB -// A8.6.49 ISB -static inline bool MemBarrierInstr(uint32_t insn) { - unsigned op7_4 = slice(insn, 7, 4); - if (slice(insn, 31, 8) == 0xf57ff0 && (op7_4 >= 4 && op7_4 <= 6)) - return true; - - return false; -} - static inline bool PreLoadOpcode(unsigned Opcode) { switch(Opcode) { case ARM::PLDi12: case ARM::PLDrs: @@ -2878,8 +3330,8 @@ static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { // Preload Data/Instruction requires either 2 or 3 operands. - // PLDi, PLDWi, PLIi: addrmode_imm12 - // PLDr[a|m], PLDWr[a|m], PLIr[a|m]: ldst_so_reg + // PLDi12, PLDWi12, PLIi12: addrmode_imm12 + // PLDrs, PLDWrs, PLIrs: ldst_so_reg MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn)))); @@ -2888,10 +3340,19 @@ static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, || Opcode == ARM::PLIi12) { unsigned Imm12 = slice(insn, 11, 0); bool Negative = getUBit(insn) == 0; + + // A8.6.118 PLD (literal) PLDWi12 with Rn=PC is transformed to PLDi12. + if (Opcode == ARM::PLDWi12 && slice(insn, 19, 16) == 0xF) { + DEBUG(errs() << "Rn == '1111': PLDWi12 morphed to PLDi12\n"); + MI.setOpcode(ARM::PLDi12); + } + // -0 is represented specially. All other values are as normal. + int Offset = Negative ? -1 * Imm12 : Imm12; if (Imm12 == 0 && Negative) - Imm12 = INT32_MIN; - MI.addOperand(MCOperand::CreateImm(Imm12)); + Offset = INT32_MIN; + + MI.addOperand(MCOperand::CreateImm(Offset)); NumOpsAdded = 2; } else { MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -2917,14 +3378,20 @@ static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - if (MemBarrierInstr(insn)) { - // DMBsy, DSBsy, and ISBsy instructions have zero operand and are taken care - // of within the generic ARMBasicMCBuilder::BuildIt() method. - // + if (Opcode == ARM::DMB || Opcode == ARM::DSB) { // Inst{3-0} encodes the memory barrier option for the variants. - MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0))); - NumOpsAdded = 1; - return true; + unsigned opt = slice(insn, 3, 0); + switch (opt) { + case ARM_MB::SY: case ARM_MB::ST: + case ARM_MB::ISH: case ARM_MB::ISHST: + case ARM_MB::NSH: case ARM_MB::NSHST: + case ARM_MB::OSH: case ARM_MB::OSHST: + MI.addOperand(MCOperand::CreateImm(opt)); + NumOpsAdded = 1; + return true; + default: + return false; + } } switch (Opcode) { @@ -2936,6 +3403,11 @@ static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, case ARM::WFI: case ARM::SEV: return true; + case ARM::SWP: + case ARM::SWPB: + // SWP, SWPB: Rd Rm Rn + // Delegate to DisassembleLdStExFrm().... + return DisassembleLdStExFrm(MI, Opcode, insn, NumOps, NumOpsAdded, B); default: break; } @@ -2950,20 +3422,32 @@ static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // opcodes which match the same real instruction. This is needed since there's // no current handling of optional arguments. Fix here when a better handling // of optional arguments is implemented. 
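The CPS cases below screen the imod field (Inst{19-18}) before building operands; a standalone restatement of that screen (function name illustrative), assuming the usual ARMv7 reading that imod 0b10 enables and 0b11 disables interrupts:

    #include <cassert>
    #include <cstdint>

    static bool validCPSImod(uint32_t insn) {
      unsigned imod = (insn >> 18) & 3; // Inst{19-18}
      return imod == 2 || imod == 3;    // only enable/disable survive
    }

    int main() {
      assert(!validCPSImod(0u << 18)); // imod = 0b00: rejected below
      assert(!validCPSImod(1u << 18)); // imod = 0b01: rejected below
      assert(validCPSImod(2u << 18));  // cpsie-style encodings
      assert(validCPSImod(3u << 18));  // cpsid-style encodings
    }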
- if (Opcode == ARM::CPS3p) {
+ if (Opcode == ARM::CPS3p) { // M = 1
+ // Reject the impossible imod values by returning false:
+ // 1. (imod=0b01)
+ //
+ // In addition, AsmPrinter cannot handle imod=0b00, and (imod=0b00,M=1,
+ // iflags!=0) is an invalid combination anyway, so imod=0b00 is rejected
+ // here as well.
+ if (slice(insn, 19, 18) == 0 || slice(insn, 19, 18) == 1)
+ return false;
MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6))); // iflags
MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
NumOpsAdded = 3;
return true;
}
- if (Opcode == ARM::CPS2p) {
+ if (Opcode == ARM::CPS2p) { // mode = 0, M = 0
+ // Reject the impossible imod values by returning false:
+ // 1. (imod=0b00,M=0)
+ // 2. (imod=0b01)
+ if (slice(insn, 19, 18) == 0 || slice(insn, 19, 18) == 1)
+ return false;
MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6))); // iflags
NumOpsAdded = 2;
return true;
}
- if (Opcode == ARM::CPS1p) {
+ if (Opcode == ARM::CPS1p) { // imod = 0, iflags = 0, M = 1
MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
NumOpsAdded = 1;
return true;
}
@@ -3142,7 +3626,7 @@ bool ARMBasicMCBuilder::DoPredicateOperands(MCInst& MI, unsigned Opcode,
return false;
}
-
+
/// TryPredicateAndSBitModifier - TryPredicateAndSBitModifier tries to process
/// the possible Predicate and SBitModifier, to build the remaining MCOperand
/// constituents.
@@ -3154,6 +3638,7 @@ bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode,
const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
const std::string &Name = ARMInsts[Opcode].Name;
unsigned Idx = MI.getNumOperands();
+ uint64_t TSFlags = ARMInsts[Opcode].TSFlags;
// First, we check whether this instr specifies the PredicateOperand through
// a pair of TargetOperandInfos with isPredicate() property.
@@ -3173,14 +3658,23 @@ bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode,
// like ARM.
//
// A8.6.16 B
- if (Name == "t2Bcc")
- MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 25, 22))));
- else if (Name == "tBcc")
- MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 11, 8))));
- else
+ // Check for undefined encodings.
+ unsigned cond;
+ if (Name == "t2Bcc") {
+ if ((cond = slice(insn, 25, 22)) >= 14)
+ return false;
+ MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
+ } else if (Name == "tBcc") {
+ if ((cond = slice(insn, 11, 8)) == 14)
+ return false;
+ MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
+ } else
MI.addOperand(MCOperand::CreateImm(ARMCC::AL));
} else {
// ARM instructions get their condition field from Inst{31-28}.
+ // We should reject Inst{31-28} = 0b1111 as an invalid encoding.
+ if (!isNEONDomain(TSFlags) && getCondField(insn) == 0xF)
+ return false;
MI.addOperand(MCOperand::CreateImm(CondCode(getCondField(insn))));
}
}
@@ -3243,3 +3737,84 @@ ARMBasicMCBuilder *llvm::CreateMCBuilder(unsigned Opcode, ARMFormat Format) {
return new ARMBasicMCBuilder(Opcode, Format,
ARMInsts[Opcode].getNumOperands());
}
+
+/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic
+/// operand in place of the immediate Value in the MCInst. The immediate
+/// Value has had any PC adjustment made by the caller. If the getOpInfo()
+/// function was set as part of the setupBuilderForSymbolicDisassembly() call
+/// then that function is called to get any symbolic information at the
+/// builder's Address for this instruction. If that returns non-zero then the
+/// symbolic information it returns is used to create an MCExpr and that is
+/// added as an operand to the MCInst. This function returns true if it adds
+/// an operand to the MCInst and false otherwise.
+bool ARMBasicMCBuilder::tryAddingSymbolicOperand(uint64_t Value,
+ uint64_t InstSize,
+ MCInst &MI) {
+ if (!GetOpInfo)
+ return false;
+
+ struct LLVMOpInfo1 SymbolicOp;
+ SymbolicOp.Value = Value;
+ if (!GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp))
+ return false;
+
+ const MCExpr *Add = NULL;
+ if (SymbolicOp.AddSymbol.Present) {
+ if (SymbolicOp.AddSymbol.Name) {
+ StringRef Name(SymbolicOp.AddSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Sub = NULL;
+ if (SymbolicOp.SubtractSymbol.Present) {
+ if (SymbolicOp.SubtractSymbol.Name) {
+ StringRef Name(SymbolicOp.SubtractSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Off = NULL;
+ if (SymbolicOp.Value != 0)
+ Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+ const MCExpr *Expr;
+ if (Sub) {
+ const MCExpr *LHS;
+ if (Add)
+ LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+ else
+ LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+ else
+ Expr = LHS;
+ } else if (Add) {
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+ else
+ Expr = Add;
+ } else {
+ if (Off != 0)
+ Expr = Off;
+ else
+ Expr = MCConstantExpr::Create(0, *Ctx);
+ }
+
+ if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
+ MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
+ else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
+ MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
+ else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
+ MI.addOperand(MCOperand::CreateExpr(Expr));
+ else
+ assert(0 && "bad SymbolicOp.VariantKind");
+
+ return true;
+}
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
index 9c30d33..a7ba141 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
+++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
@@ -22,12 +22,17 @@
#define ARMDISASSEMBLERCORE_H
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm-c/Disassembler.h"
#include "ARMBaseInstrInfo.h"
#include "ARMRegisterInfo.h"
#include "ARMDisassembler.h"
namespace llvm {
+class MCContext;
class ARMUtils {
public:
@@ -134,6 +139,31 @@ static inline void setSlice(unsigned &Bits, unsigned From, unsigned To,
Bits |= (Val & Mask) << To;
}
+// Return an integer result equal to the number of bits of x that are ones.
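+// For example, BitCount(0xF0F0F0F0ULL) == 16. A sketch of the intended use,
+// mirroring the register-list check in the Thumb LDM/STM decoder below:
+//   if (BitCount(slice(insn, 7, 0)) < 1) return false; // empty list: UNPREDICTABLE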
+static inline uint32_t
+BitCount (uint64_t x)
+{
+ // c accumulates the total bits set in x
+ uint32_t c;
+ for (c = 0; x; ++c)
+ {
+ x &= x - 1; // clear the least significant bit set
+ }
+ return c;
+}
+
+static inline bool
+BitIsSet (const uint64_t value, const uint64_t bit)
+{
+ return (value & (1ull << bit)) != 0;
+}
+
+static inline bool
+BitIsClear (const uint64_t value, const uint64_t bit)
+{
+ return (value & (1ull << bit)) == 0;
+}
+
/// Various utilities for checking the target specific flags.
/// A unary data processing instruction doesn't have an Rn operand.
@@ -141,6 +171,12 @@ static inline bool isUnaryDP(uint64_t TSFlags) {
return (TSFlags & ARMII::UnaryDP);
}
+/// A NEON Domain instruction has cond field (Inst{31-28}) as 0b1111.
+static inline bool isNEONDomain(uint64_t TSFlags) {
+ return (TSFlags & ARMII::DomainNEON) ||
+ (TSFlags & ARMII::DomainNEONA8);
+}
+
/// This four-bit field describes the addressing mode used.
/// See also ARMBaseInstrInfo.h.
static inline unsigned getAddrMode(uint64_t TSFlags) {
@@ -196,7 +232,7 @@ private:
public:
ARMBasicMCBuilder(ARMBasicMCBuilder &B)
: Opcode(B.Opcode), Format(B.Format), NumOps(B.NumOps), Disasm(B.Disasm),
- SP(B.SP) {
+ SP(B.SP), GetOpInfo(0), DisInfo(0), Ctx(0) {
Err = 0;
}
@@ -255,6 +291,44 @@ private:
assert(SP);
return slice(SP->ITState, 7, 4);
}
+
+private:
+ //
+ // Hooks for symbolic disassembly via the public 'C' interface.
+ //
+ // The function to get the symbolic information for operands.
+ LLVMOpInfoCallback GetOpInfo;
+ // The pointer to the block of symbolic information for the above callback.
+ void *DisInfo;
+ // The assembly context for creating symbols and MCExprs in place of
+ // immediate operands when there is symbolic information.
+ MCContext *Ctx;
+ // The address of the instruction being disassembled.
+ uint64_t Address;
+
+public:
+ void setupBuilderForSymbolicDisassembly(LLVMOpInfoCallback getOpInfo,
+ void *disInfo, MCContext *ctx,
+ uint64_t address) {
+ GetOpInfo = getOpInfo;
+ DisInfo = disInfo;
+ Ctx = ctx;
+ Address = address;
+ }
+
+ uint64_t getBuilderAddress() const { return Address; }
+
+ /// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a
+ /// symbolic operand in place of the immediate Value in the MCInst. The
+ /// immediate Value has had any PC adjustment made by the caller. If the
+ /// getOpInfo() function was set as part of the
+ /// setupBuilderForSymbolicDisassembly() call then that function is called to
+ /// get any symbolic information at the builder's Address for this
+ /// instruction. If that returns non-zero then the symbolic information it
+ /// returns is used to create an MCExpr and that is added as an operand to
+ /// the MCInst. This function returns true if it adds an operand to the
+ /// MCInst and false otherwise.
+ bool tryAddingSymbolicOperand(uint64_t Value, uint64_t InstSize, MCInst &MI);
+
};
} // namespace llvm
diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
index 23372e0..8d39982 100644
--- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
+++ b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
@@ -108,6 +108,8 @@ static inline bool IsGPR(unsigned RegClass) {
// Utilities for 32-bit Thumb instructions.
+static inline bool BadReg(uint32_t n) { return n == 13 || n == 15; }
+
// Extract imm4: Inst{19-16}.
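// (e.g. slice(0x000A0000, 19, 16) == 0xA, so getImm4() below yields 0xA;
// each of these field helpers is a thin wrapper around slice().)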
static inline unsigned getImm4(uint32_t insn) { return slice(insn, 19, 16); @@ -398,9 +400,17 @@ static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn, assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); - MI.addOperand(MCOperand::CreateImm(UseRt ? getT1Imm8(insn) - : (Imm3 ? getT1Imm3(insn) - : getT1Imm5(insn)))); + unsigned Imm = 0; + if (UseRt) + Imm = getT1Imm8(insn); + else if (Imm3) + Imm = getT1Imm3(insn); + else { + Imm = getT1Imm5(insn); + ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 12, 11)); + getImmShiftSE(ShOp, Imm); + } + MI.addOperand(MCOperand::CreateImm(Imm)); } ++OpIdx; @@ -469,6 +479,7 @@ static bool DisassembleThumb1DP(MCInst &MI, unsigned Opcode, uint32_t insn, // tBX_RET: 0 operand // tBX_RET_vararg: Rm // tBLXr_r9: Rm +// tBRIND: Rm static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -476,11 +487,17 @@ static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn, if (NumOps == 0) return true; - // BX/BLX has 1 reg operand: Rm. - if (NumOps == 1) { + // BX/BLX/tBRIND (indirect branch, i.e, mov pc, Rm) has 1 reg operand: Rm. + if (Opcode==ARM::tBLXr_r9 || Opcode==ARM::tBX_Rm || Opcode==ARM::tBRIND) { + if (Opcode != ARM::tBRIND) { + // Handling the two predicate operands before the reg operand. + if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps)) + return false; + NumOpsAdded += 2; + } MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, getT1Rm(insn)))); - NumOpsAdded = 1; + NumOpsAdded += 1; return true; } @@ -598,7 +615,7 @@ static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode, // A6.2.4 Load/store single data item // -// Load/Store Register (reg|imm): tRd tRn imm5 tRm +// Load/Store Register (reg|imm): tRd tRn imm5|tRm // Load Register Signed Byte|Halfword: tRd tRn tRm static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { @@ -607,11 +624,6 @@ static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, const TargetOperandInfo *OpInfo = TID.OpInfo; unsigned &OpIdx = NumOpsAdded; - // Table A6-5 16-bit Thumb Load/store instructions - // opA = 0b0101 for STR/LDR (register) and friends. - // Otherwise, we have STR/LDR (immediate) and friends. - bool Imm5 = (opA != 5); - assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && OpInfo[1].RegClass == ARM::tGPRRegClassID @@ -624,28 +636,28 @@ static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, getT1tRn(insn)))); OpIdx = 2; - // We have either { imm5, tRm } or { tRm } remaining. - // Process the imm5 first. Note that STR/LDR (register) should skip the imm5 - // offset operand for t_addrmode_s[1|2|4]. + // We have either { imm5 } or { tRm } remaining. + // Note that STR/LDR (register) should skip the imm5 offset operand for + // t_addrmode_s[1|2|4]. assert(OpIdx < NumOps && "More operands expected"); if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { - - MI.addOperand(MCOperand::CreateImm(Imm5 ? getT1Imm5(insn) : 0)); + // Table A6-5 16-bit Thumb Load/store instructions + // opA = 0b0101 for STR/LDR (register) and friends. + // Otherwise, we have STR/LDR (immediate) and friends. 
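+ // For instance, "str r0, [r1, #4]" takes this imm5 path (opA != 5), while
+ // "str r0, [r1, r2]" arrives with opA == 5 and is decoded through the tRm
+ // branch below; both mnemonics are assumed examples, not from the TD files.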
+ assert(opA != 5 && "Immediate operand expected for this opcode"); + MI.addOperand(MCOperand::CreateImm(getT1Imm5(insn))); + ++OpIdx; + } else { + // The next reg operand is tRm, the offset. + assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID + && "Thumb reg operand expected"); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, + getT1tRm(insn)))); ++OpIdx; } - - // The next reg operand is tRm, the offset. - assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID - && "Thumb reg operand expected"); - MI.addOperand(MCOperand::CreateReg( - Imm5 ? 0 - : getRegisterEnum(B, ARM::tGPRRegClassID, - getT1tRm(insn)))); - ++OpIdx; - return true; } @@ -895,6 +907,10 @@ static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode, } unsigned RegListBits = slice(insn, 7, 0); + if (BitCount(RegListBits) < 1) { + DEBUG(errs() << "if BitCount(registers) < 1 then UNPREDICTABLE\n"); + return false; + } // Fill the variadic part of reglist. for (unsigned i = 0; i < 8; ++i) @@ -945,6 +961,11 @@ static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn, : (int)Imm8)); // Predicate operands by ARMBasicMCBuilder::TryPredicateAndSBitModifier(). + // But note that for tBcc, if cond = '1110' then UNDEFINED. + if (Opcode == ARM::tBcc && slice(insn, 11, 8) == 14) { + DEBUG(errs() << "if cond = '1110' then UNDEFINED\n"); + return false; + } NumOpsAdded = 1; return true; @@ -965,11 +986,7 @@ static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned Imm11 = getT1Imm11(insn); - // When executing a Thumb instruction, PC reads as the address of the current - // instruction plus 4. The assembler subtracts 4 from the difference between - // the branch instruction and the target address, disassembler has to add 4 to - // to compensate. 
- MI.addOperand(MCOperand::CreateImm(SignExtend32<12>(Imm11 << 1) + 4)); + MI.addOperand(MCOperand::CreateImm(SignExtend32<12>(Imm11 << 1))); NumOpsAdded = 1; @@ -1129,8 +1146,12 @@ static bool DisassembleThumb2SRS(MCInst &MI, unsigned Opcode, uint32_t insn, // t2RFE[IA|DB]W/t2RFE[IA|DB]: Rn static bool DisassembleThumb2RFE(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRn(insn)))); + unsigned Rn = decodeRn(insn); + if (Rn == 15) { + DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n"); + return false; + } + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,ARM::GPRRegClassID,Rn))); NumOpsAdded = 1; return true; } @@ -1149,7 +1170,7 @@ static bool DisassembleThumb2LdStMul(MCInst &MI, unsigned Opcode, uint32_t insn, Opcode == ARM::t2STMIA || Opcode == ARM::t2STMIA_UPD || Opcode == ARM::t2STMDB || Opcode == ARM::t2STMDB_UPD) && "Unexpected opcode"); - assert(NumOps >= 5 && "Thumb2 LdStMul expects NumOps >= 5"); + assert(NumOps >= 4 && "Thumb2 LdStMul expects NumOps >= 4"); NumOpsAdded = 0; @@ -1203,45 +1224,79 @@ static bool DisassembleThumb2LdStEx(MCInst &MI, unsigned Opcode, uint32_t insn, OpIdx = 0; assert(NumOps >= 2 - && OpInfo[0].RegClass == ARM::GPRRegClassID - && OpInfo[1].RegClass == ARM::GPRRegClassID + && OpInfo[0].RegClass > 0 + && OpInfo[1].RegClass > 0 && "Expect >=2 operands and first two as reg operands"); bool isStore = (ARM::t2STREX <= Opcode && Opcode <= ARM::t2STREXH); bool isSW = (Opcode == ARM::t2LDREX || Opcode == ARM::t2STREX); bool isDW = (Opcode == ARM::t2LDREXD || Opcode == ARM::t2STREXD); + unsigned Rt = decodeRd(insn); + unsigned Rt2 = decodeRs(insn); // But note that this is Rd for t2STREX. + unsigned Rd = decodeRm(insn); + unsigned Rn = decodeRn(insn); + + // Some sanity checking first. + if (isStore) { + // if d == n || d == t then UNPREDICTABLE + // if d == n || d == t || d == t2 then UNPREDICTABLE + if (isDW) { + if (Rd == Rn || Rd == Rt || Rd == Rt2) { + DEBUG(errs() << "if d == n || d == t || d == t2 then UNPREDICTABLE\n"); + return false; + } + } else { + if (isSW) { + if (Rt2 == Rn || Rt2 == Rt) { + DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n"); + return false; + } + } else { + if (Rd == Rn || Rd == Rt) { + DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n"); + return false; + } + } + } + } else { + // Load + // A8.6.71 LDREXD + // if t == t2 then UNPREDICTABLE + if (isDW && Rt == Rt2) { + DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n"); + return false; + } + } + // Add the destination operand for store. if (isStore) { MI.addOperand(MCOperand::CreateReg( - getRegisterEnum(B, ARM::GPRRegClassID, - isSW ? decodeRs(insn) : decodeRm(insn)))); + getRegisterEnum(B, OpInfo[OpIdx].RegClass, + isSW ? Rt2 : Rd))); ++OpIdx; } // Source operand for store and destination operand for load. - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRd(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, + Rt))); ++OpIdx; // Thumb2 doubleword complication: with an extra source/destination operand. if (isDW) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, - decodeRs(insn)))); + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass, + Rt2))); ++OpIdx; } // Finally add the pointer operand. 
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
+ Rn)));
++OpIdx;
return true;
}
-// LLVM, as of Jan-05-2010, does not output <Rt2>, i.e., Rs, in the asm.
-// Whereas the ARM Arch. Manual does not require that t2 = t+1 like in ARM ISA.
-//
// t2LDRDi8: Rd Rs Rn imm8s4 (offset mode)
// t2LDRDpci: Rd Rs imm8s4 (Not decoded, prefer the generic t2LDRDi8 version)
// t2STRDi8: Rd Rs Rn imm8s4 (offset mode)
@@ -1255,18 +1310,50 @@ static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode,
if (!OpInfo) return false;
assert(NumOps >= 4
- && OpInfo[0].RegClass == ARM::GPRRegClassID
- && OpInfo[1].RegClass == ARM::GPRRegClassID
- && OpInfo[2].RegClass == ARM::GPRRegClassID
+ && OpInfo[0].RegClass > 0
+ && OpInfo[0].RegClass == OpInfo[1].RegClass
+ && OpInfo[2].RegClass > 0
&& OpInfo[3].RegClass < 0
&& "Expect >= 4 operands and first 3 as reg operands");
+ // Thumb allows for specifying Rt and Rt2, unlike ARM (which has Rt2==Rt+1).
+ unsigned Rt = decodeRd(insn);
+ unsigned Rt2 = decodeRs(insn);
+ unsigned Rn = decodeRn(insn);
+
+ // Some sanity checking first.
+
+ // A8.6.67 LDRD (literal) has its W bit as (0).
+ if (Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2LDRD_PRE || Opcode == ARM::t2LDRD_POST) {
+ if (Rn == 15 && slice(insn, 21, 21) != 0)
+ return false;
+ } else {
+ // For Dual Store, PC cannot be used as the base register.
+ if (Rn == 15) {
+ DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n");
+ return false;
+ }
+ }
+ if (Rt == Rt2) {
+ DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n");
+ return false;
+ }
+ if (Opcode != ARM::t2LDRDi8 && Opcode != ARM::t2STRDi8) {
+ if (Rn == Rt || Rn == Rt2) {
+ DEBUG(errs() << "if wback && (n == t || n == t2) then UNPREDICTABLE\n");
+ return false;
+ }
+ }
+
// Add the <Rt> <Rt2> operands.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
+ unsigned RegClassPair = OpInfo[0].RegClass;
+ unsigned RegClassBase = OpInfo[2].RegClass;
+
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassPair,
decodeRd(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassPair,
decodeRs(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassBase,
decodeRn(insn))));
// Finally add (+/-)imm8*4, depending on the U bit.
@@ -1394,9 +1481,12 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn,
if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
&& !OpInfo[OpIdx].isOptionalDef()) {
- if (Thumb2ShiftOpcode(Opcode))
- MI.addOperand(MCOperand::CreateImm(getShiftAmtBits(insn)));
- else {
+ if (Thumb2ShiftOpcode(Opcode)) {
+ unsigned Imm = getShiftAmtBits(insn);
+ ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 5, 4));
+ getImmShiftSE(ShOp, Imm);
+ MI.addOperand(MCOperand::CreateImm(Imm));
+ } else {
// Build the constant shift specifier operand.
unsigned bits2 = getShiftTypeBits(insn); unsigned imm5 = getShiftAmtBits(insn); @@ -1421,7 +1511,8 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { - const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; + const TargetInstrDesc &TID = ARMInsts[Opcode]; + const TargetOperandInfo *OpInfo = TID.OpInfo; unsigned &OpIdx = NumOpsAdded; OpIdx = 0; @@ -1448,8 +1539,15 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, DEBUG(errs()<<"Thumb2 encoding error: d==15 for DPModImm 2-reg instr.\n"); return false; } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID, - decodeRn(insn)))); + int Idx; + if ((Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) { + // The reg operand is tied to the first reg operand. + MI.addOperand(MI.getOperand(Idx)); + } else { + // Add second reg operand. + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID, + decodeRn(insn)))); + } ++OpIdx; } @@ -1518,7 +1616,7 @@ static bool DisassembleThumb2Sat(MCInst &MI, unsigned Opcode, uint32_t insn, // o t2ADDri12, t2SUBri12: Rs Rn imm12 // o t2LEApcrel (ADR): Rs imm12 // o t2BFC (BFC): Rs Ro(TIED_TO) bf_inv_mask_imm -// o t2BFI (BFI) (Currently not defined in LLVM as of Jan-07-2010) +// o t2BFI (BFI): Rs Ro(TIED_TO) Rn bf_inv_mask_imm // o t2MOVi16: Rs imm16 // o t2MOVTi16: Rs imm16 // o t2SBFX (SBFX): Rs Rn lsb width @@ -1579,9 +1677,10 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, if (Opcode == ARM::t2ADDri12 || Opcode == ARM::t2SUBri12 || Opcode == ARM::t2LEApcrel) MI.addOperand(MCOperand::CreateImm(getIImm3Imm8(insn))); - else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) - MI.addOperand(MCOperand::CreateImm(getImm16(insn))); - else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) { + else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) { + if (!B->tryAddingSymbolicOperand(getImm16(insn), 4, MI)) + MI.addOperand(MCOperand::CreateImm(getImm16(insn))); + } else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) { uint32_t mask = 0; if (getBitfieldInvMask(insn, mask)) MI.addOperand(MCOperand::CreateImm(mask)); @@ -1625,8 +1724,7 @@ static inline bool t2MiscCtrlInstr(uint32_t insn) { // A8.6.26 // t2BXJ -> Rn // -// Miscellaneous control: t2DMBsy (and its t2DMB variants), -// t2DSBsy (and its t2DSB varianst), t2ISBsy, t2CLREX +// Miscellaneous control: // -> no operand (except pred-imm pred-ccr for CLREX, memory barrier variants) // // Hint: t2NOP, t2YIELD, t2WFE, t2WFI, t2SEV @@ -1643,6 +1741,22 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode, if (NumOps == 0) return true; + if (Opcode == ARM::t2DMB || Opcode == ARM::t2DSB) { + // Inst{3-0} encodes the memory barrier option for the variants. + unsigned opt = slice(insn, 3, 0); + switch (opt) { + case ARM_MB::SY: case ARM_MB::ST: + case ARM_MB::ISH: case ARM_MB::ISHST: + case ARM_MB::NSH: case ARM_MB::NSHST: + case ARM_MB::OSH: case ARM_MB::OSHST: + MI.addOperand(MCOperand::CreateImm(opt)); + NumOpsAdded = 1; + return true; + default: + return false; + } + } + if (t2MiscCtrlInstr(insn)) return true; @@ -1719,6 +1833,17 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode, return true; } + // Some instructions have predicate operands first before the immediate. 
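+ // (Concretely, the tBL/tBLXi forms handled next: DoPredicateOperands() fills
+ // their two predicate slots first, which is why NumOpsAdded is incremented
+ // rather than assigned when the branch offset is added further down.)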
+ if (Opcode == ARM::tBLXi_r9 || Opcode == ARM::tBLr9) {
+ // Handling the two predicate operands before the imm operand.
+ if (B->DoPredicateOperands(MI, Opcode, insn, NumOps))
+ NumOpsAdded += 2;
+ else {
+ DEBUG(errs() << "Expected predicate operands not found.\n");
+ return false;
+ }
+ }
+
// Add the imm operand.
int Offset = 0;
@@ -1739,13 +1864,12 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode,
Offset = decodeImm32_BLX(insn);
break;
}
- // When executing a Thumb instruction, PC reads as the address of the current
- // instruction plus 4. The assembler subtracts 4 from the difference between
- // the branch instruction and the target address, disassembler has to add 4
- // to compensate.
- MI.addOperand(MCOperand::CreateImm(Offset + 4));
- NumOpsAdded = 1;
+ if (!B->tryAddingSymbolicOperand(Offset + B->getBuilderAddress() + 4, 4, MI))
+ MI.addOperand(MCOperand::CreateImm(Offset));
+
+ // This is an increment as some predicate operands may have been added first.
+ NumOpsAdded += 1;
return true;
}
@@ -1787,7 +1911,7 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn,
decodeRn(insn))));
++OpIdx;
- if (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) {
+ if (OpInfo[OpIdx].RegClass == ARM::rGPRRegClassID) {
MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
decodeRm(insn))));
} else {
assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
&& !OpInfo[OpIdx].isOptionalDef()
&& "Pure imm operand expected");
int Offset = 0;
- if (slice(insn, 19, 16) == 0xFF) {
- bool Negative = slice(insn, 23, 23) == 0;
- unsigned Imm12 = getImm12(insn);
- Offset = Negative ? -1 - Imm12 : 1 * Imm12;
- } else if (Opcode == ARM::t2PLDi8 || Opcode == ARM::t2PLDWi8 ||
- Opcode == ARM::t2PLIi8) {
+ if (Opcode == ARM::t2PLDi8 || Opcode == ARM::t2PLDWi8 ||
+ Opcode == ARM::t2PLIi8) {
// A8.6.117 Encoding T2: add = FALSE
unsigned Imm8 = getImm8(insn);
- Offset = -1 - Imm8;
+ Offset = -1 * Imm8;
+ } else {
+ // The i12 forms. See, for example, A8.6.117 Encoding T1.
+ // Note that currently t2PLDi12 also handles the previously named t2PLDpci
+ // opcode, that's why we use decodeImm12(insn) which returns +/- imm12.
Offset = decodeImm12(insn);
+ }
MI.addOperand(MCOperand::CreateImm(Offset));
}
++OpIdx;
@@ -1820,6 +1944,87 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn,
return true;
}
+static bool BadRegsThumb2LdSt(unsigned Opcode, uint32_t insn, bool Load,
+ unsigned R0, unsigned R1, unsigned R2, bool UseRm, bool WB) {
+
+ // Inst{22-21} encodes the data item transferred for load/store.
+ // For single word, it is encoded as 0b10.
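+ // Spelled out, as read off the checks below: Inst{22-21} == 0b00 selects
+ // byte, 0b01 halfword, and 0b10 word.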
+ bool Word = (slice(insn, 22, 21) == 2); + bool Half = (slice(insn, 22, 21) == 1); + bool Byte = (slice(insn, 22, 21) == 0); + + if (UseRm && BadReg(R2)) { + DEBUG(errs() << "if BadReg(m) then UNPREDICTABLE\n"); + return true; + } + + if (Load) { + if (!Word && R0 == 13) { + DEBUG(errs() << "if t == 13 then UNPREDICTABLE\n"); + return true; + } + if (Byte) { + if (WB && R0 == 15 && slice(insn, 10, 8) == 3) { + // A8.6.78 LDRSB (immediate) Encoding T2 (errata markup 8.0) + DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n"); + return true; + } + } + // A6.3.8 Load halfword, memory hints + if (Half) { + if (WB) { + if (R0 == R1) { + // A8.6.82 LDRSH (immediate) Encoding T2 + DEBUG(errs() << "if WB && n == t then UNPREDICTABLE\n"); + return true; + } + if (R0 == 15 && slice(insn, 10, 8) == 3) { + // A8.6.82 LDRSH (immediate) Encoding T2 (errata markup 8.0) + DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n"); + return true; + } + } else { + if (Opcode == ARM::t2LDRHi8 || Opcode == ARM::t2LDRSHi8) { + if (R0 == 15 && slice(insn, 10, 8) == 4) { + // A8.6.82 LDRSH (immediate) Encoding T2 + DEBUG(errs() << "if Rt == '1111' and PUW == '100' then SEE" + << " \"Unallocated memory hints\"\n"); + return true; + } + } else { + if (R0 == 15) { + // A8.6.82 LDRSH (immediate) Encoding T1 + DEBUG(errs() << "if Rt == '1111' then SEE" + << " \"Unallocated memory hints\"\n"); + return true; + } + } + } + } + } else { + if (WB && R0 == R1) { + DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n"); + return true; + } + if ((WB && R0 == 15) || (!WB && R1 == 15)) { + DEBUG(errs() << "if Rn == '1111' then UNDEFINED\n"); + return true; + } + if (Word) { + if ((WB && R1 == 15) || (!WB && R0 == 15)) { + DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n"); + return true; + } + } else { + if ((WB && BadReg(R1)) || (!WB && BadReg(R0))) { + DEBUG(errs() << "if BadReg(t) then UNPREDICTABLE\n"); + return true; + } + } + } + return false; +} + // A6.3.10 Store single data item // A6.3.9 Load byte, memory hints // A6.3.8 Load halfword, memory hints @@ -1865,16 +2070,16 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, OpIdx = 0; assert(NumOps >= 3 && - OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass == ARM::GPRRegClassID && + OpInfo[0].RegClass > 0 && + OpInfo[1].RegClass > 0 && "Expect >= 3 operands and first two as reg operands"); - bool ThreeReg = (OpInfo[2].RegClass == ARM::GPRRegClassID); + bool ThreeReg = (OpInfo[2].RegClass > 0); bool TIED_TO = ThreeReg && TID.getOperandConstraint(2, TOI::TIED_TO) != -1; bool Imm12 = !ThreeReg && slice(insn, 23, 23) == 1; // ARMInstrThumb2.td // Build the register operands, followed by the immediate. - unsigned R0, R1, R2 = 0; + unsigned R0 = 0, R1 = 0, R2 = 0; unsigned Rd = decodeRd(insn); int Imm = 0; @@ -1905,19 +2110,24 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, Imm = decodeImm8(insn); } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, R0))); ++OpIdx; - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, R1))); ++OpIdx; if (ThreeReg) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, + // This could be an offset register or a TIED_TO register. 
+ MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass, R2))); ++OpIdx; } + if (BadRegsThumb2LdSt(Opcode, insn, Load, R0, R1, R2, ThreeReg & !TIED_TO, + TIED_TO)) + return false; + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1947,25 +2157,25 @@ static bool DisassembleThumb2DPReg(MCInst &MI, unsigned Opcode, uint32_t insn, OpIdx = 0; assert(NumOps >= 2 && - OpInfo[0].RegClass == ARM::rGPRRegClassID && - OpInfo[1].RegClass == ARM::rGPRRegClassID && + OpInfo[0].RegClass > 0 && + OpInfo[1].RegClass > 0 && "Expect >= 2 operands and first two as reg operands"); // Build the register operands, followed by the optional rotation amount. - bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::rGPRRegClassID; + bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass > 0; - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, decodeRs(insn)))); ++OpIdx; if (ThreeReg) { - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass, decodeRn(insn)))); ++OpIdx; } - MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID, + MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass, decodeRm(insn)))); ++OpIdx; diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 1499da0..fc2aa75 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -29,6 +29,9 @@ StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const { return getInstructionName(Opcode); } +StringRef ARMInstPrinter::getRegName(unsigned RegNo) const { + return getRegisterName(RegNo); +} void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) { unsigned Opcode = MI->getOpcode(); @@ -133,9 +136,10 @@ static void printSOImm(raw_ostream &O, int64_t V, raw_ostream *CommentStream, unsigned Rot = ARM_AM::getSOImmValRot(V); // Print low-level immediate formation info, per - // A5.1.3: "Data-processing operands - Immediate". + // A5.2.3: Data-processing (immediate), and + // A5.2.4: Modified immediate constants in ARM instructions if (Rot) { - O << "#" << Imm << ", " << Rot; + O << "#" << Imm << ", #" << Rot; // Pretty printed version. if (CommentStream) *CommentStream << (int)ARM_AM::rotr32(Imm, Rot) << "\n"; @@ -178,18 +182,16 @@ void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum, } } +//===--------------------------------------------------------------------===// +// Addressing Mode #2 +//===--------------------------------------------------------------------===// -void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, - raw_ostream &O) { +void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(Op); const MCOperand &MO2 = MI->getOperand(Op+1); const MCOperand &MO3 = MI->getOperand(Op+2); - if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. 
- printOperand(MI, Op, O); - return; - } - O << "[" << getRegisterName(MO1.getReg()); if (!MO2.getReg()) { @@ -212,6 +214,50 @@ void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, O << "]"; } +void ARMInstPrinter::printAM2PostIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + O << "[" << getRegisterName(MO1.getReg()) << "], "; + + if (!MO2.getReg()) { + unsigned ImmOffs = ARM_AM::getAM2Offset(MO3.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << ImmOffs; + return; + } + + O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << getRegisterName(MO2.getReg()); + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm())) + << " #" << ShImm; +} + +void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + + if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op, O); + return; + } + + const MCOperand &MO3 = MI->getOperand(Op+2); + unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm()); + + if (IdxMode == ARMII::IndexModePost) { + printAM2PostIndexOp(MI, Op, O); + return; + } + printAM2PreOrOffsetIndexOp(MI, Op, O); +} + void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { @@ -235,11 +281,35 @@ void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, << " #" << ShImm; } -void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO1 = MI->getOperand(OpNum); - const MCOperand &MO2 = MI->getOperand(OpNum+1); - const MCOperand &MO3 = MI->getOperand(OpNum+2); +//===--------------------------------------------------------------------===// +// Addressing Mode #3 +//===--------------------------------------------------------------------===// + +void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); + + O << "[" << getRegisterName(MO1.getReg()) << "], "; + + if (MO2.getReg()) { + O << (char)ARM_AM::getAM3Op(MO3.getImm()) + << getRegisterName(MO2.getReg()); + return; + } + + unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()); + O << '#' + << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) + << ImmOffs; +} + +void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(Op); + const MCOperand &MO2 = MI->getOperand(Op+1); + const MCOperand &MO3 = MI->getOperand(Op+2); O << '[' << getRegisterName(MO1.getReg()); @@ -256,6 +326,18 @@ void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum, O << ']'; } +void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &MO3 = MI->getOperand(Op+2); + unsigned IdxMode = ARM_AM::getAM3IdxMode(MO3.getImm()); + + if (IdxMode == ARMII::IndexModePost) { + printAM3PostIndexOp(MI, Op, O); + return; + } + printAM3PreOrOffsetIndexOp(MI, Op, O); +} + void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { @@ -314,6 +396,12 @@ void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, 
unsigned OpNum, O << "]"; } +void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO1 = MI->getOperand(OpNum); + O << "[" << getRegisterName(MO1.getReg()) << "]"; +} + void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { @@ -414,16 +502,6 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, } } -void ARMInstPrinter::printNegZeroOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - O << '#'; - if (Op.getImm() < 0) - O << '-' << (-Op.getImm() - 1); - else - O << Op.getImm(); -} - void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm(); diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index 679d313..b3ac03a 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -17,14 +17,18 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { - class MCOperand; + +class MCOperand; +class TargetMachine; class ARMInstPrinter : public MCInstPrinter { public: - ARMInstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) {} + ARMInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) + : MCInstPrinter(MAI) {} virtual void printInst(const MCInst *MI, raw_ostream &O); virtual StringRef getOpcodeName(unsigned Opcode) const; + virtual StringRef getRegName(unsigned RegNo) const; static const char *getInstructionName(unsigned Opcode); @@ -38,15 +42,25 @@ public: void printSOImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printSORegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum, + raw_ostream &O); void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM3PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum, + raw_ostream &O); void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAddrMode7Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); @@ -87,9 +101,7 @@ public: void printSetendOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printCPSIMod(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printCPSIFlag(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printCPSOptionOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printNegZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); diff --git 
a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 9a27e2f..f6d0242 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -15,11 +15,13 @@ #define DEBUG_TYPE "mlx-expansion" #include "ARM.h" #include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -49,15 +51,17 @@ namespace { const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; + bool isA9; unsigned MIIdx; MachineInstr* LastMIs[4]; + SmallPtrSet<MachineInstr*, 4> IgnoreStall; void clearStack(); void pushStack(MachineInstr *MI); MachineInstr *getAccDefMI(MachineInstr *MI) const; unsigned getDefReg(MachineInstr *MI) const; bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const; - bool FindMLxHazard(MachineInstr *MI) const; + bool FindMLxHazard(MachineInstr *MI); void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, unsigned MulOpc, unsigned AddSubOpc, bool NegAcc, bool HasLane); @@ -146,7 +150,7 @@ bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const { } -bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { +bool MLxExpansion::FindMLxHazard(MachineInstr *MI) { if (NumExpand >= ExpandLimit) return false; @@ -154,7 +158,7 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { return true; MachineInstr *DefMI = getAccDefMI(MI); - if (TII->isFpMLxInstruction(DefMI->getOpcode())) + if (TII->isFpMLxInstruction(DefMI->getOpcode())) { // r0 = vmla // r3 = vmla r0, r1, r2 // takes 16 - 17 cycles @@ -163,24 +167,33 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const { // r4 = vmul r1, r2 // r3 = vadd r0, r4 // takes about 14 - 15 cycles even with vmul stalling for 4 cycles. + IgnoreStall.insert(DefMI); return true; + } + + if (IgnoreStall.count(MI)) + return false; // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall // preserves the in-order retirement of the instructions. // Look at the next few instructions, if *most* of them can cause hazards, // then the scheduler can't *fix* this, we'd better break up the VMLA. + unsigned Limit1 = isA9 ? 1 : 4; + unsigned Limit2 = isA9 ? 1 : 4; for (unsigned i = 1; i <= 4; ++i) { int Idx = ((int)MIIdx - i + 4) % 4; MachineInstr *NextMI = LastMIs[Idx]; if (!NextMI) continue; - if (TII->canCauseFpMLxStall(NextMI->getOpcode())) - return true; + if (TII->canCauseFpMLxStall(NextMI->getOpcode())) { + if (i <= Limit1) + return true; + } // Look for VMLx RAW hazard. 
- if (hasRAWHazard(getDefReg(MI), NextMI)) + if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI)) return true; } @@ -248,6 +261,7 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { bool Changed = false; clearStack(); + IgnoreStall.clear(); unsigned Skip = 0; MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); @@ -299,6 +313,8 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) { TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo()); TRI = Fn.getTarget().getRegisterInfo(); MRI = &Fn.getRegInfo(); + const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>(); + isA9 = STI->isCortexA9(); bool Modified = false; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 9fc3fb9..8ba9a27 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -657,3 +657,27 @@ Note that both "tst" and "moveq" are redundant. //===---------------------------------------------------------------------===// +When loading immediate constants with movt/movw, if there are multiple +constants needed with the same low 16 bits, and those values are not live at +the same time, it would be possible to use a single movw instruction, followed +by multiple movt instructions to rewrite the high bits to different values. +For example: + + volatile store i32 -1, i32* inttoptr (i32 1342210076 to i32*), align 4, + !tbaa +!0 + volatile store i32 -1, i32* inttoptr (i32 1342341148 to i32*), align 4, + !tbaa +!0 + +is compiled and optimized to: + + movw r0, #32796 + mov.w r1, #-1 + movt r0, #20480 + str r1, [r0] + movw r0, #32796 @ <= this MOVW is not needed, value is there already + movt r0, #20482 + str r1, [r0] + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 233e165..dee3d27 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -34,13 +34,14 @@ bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const return !MF.getFrameInfo()->hasVarSizedObjects(); } -static void emitSPUpdate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const TargetInstrInfo &TII, DebugLoc dl, - const Thumb1RegisterInfo &MRI, - int NumBytes) { - emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII, - MRI, dl); +static void +emitSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetInstrInfo &TII, DebugLoc dl, + const Thumb1RegisterInfo &MRI, + int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, + MRI, MIFlags); } void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { @@ -70,11 +71,13 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { int FramePtrSpillFI = 0; if (VARegSaveSize) - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -VARegSaveSize); + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -VARegSaveSize, + MachineInstr::FrameSetup); if (!AFI->hasStackFrame()) { if (NumBytes != 0) - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes); + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, + MachineInstr::FrameSetup); return; } @@ -131,7 +134,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { // Adjust FP so it point to the stack slot that contains the previous FP. 
if (hasFP(MF)) { BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0); + .addFrameIndex(FramePtrSpillFI).addImm(0) + .setMIFlags(MachineInstr::FrameSetup); if (NumBytes > 7) // If offset is > 7 then sp cannot be adjusted in a single instruction, // try restoring from fp instead. @@ -140,7 +144,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { if (NumBytes) // Insert it after all the callee-save spills. - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes); + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, + MachineInstr::FrameSetup); if (STI.isTargetELF() && hasFP(MF)) MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - @@ -156,7 +161,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { // to reference locals. if (RegInfo->hasBasePointer(MF)) BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), BasePtr).addReg(ARM::SP); - + // If the frame has variable sized objects then the epilogue must restore // the sp from fp. We can assume there's an FP here since hasFP already // checks for hasVarSizedObjects. @@ -232,8 +237,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, if (NumBytes) { assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) && "No scratch register to restore SP from FP!"); - emitThumbRegPlusImmediate(MBB, MBBI, ARM::R4, FramePtr, -NumBytes, - TII, *RegInfo, dl); + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, + TII, *RegInfo); BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) .addReg(ARM::R4); } else @@ -307,6 +312,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, MIB.addReg(Reg, getKillRegState(isKill)); } + MIB.setMIFlags(MachineInstr::FrameSetup); return true; } diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h index c592e12..bcfc516 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.h +++ b/lib/Target/ARM/Thumb1FrameLowering.h @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// #ifndef __THUMB_FRAMEINFO_H_ -#define __THUMM_FRAMEINFO_H_ +#define __THUMB_FRAMEINFO_H_ #include "ARM.h" #include "ARMFrameLowering.h" diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index f62a13e..33cefb6 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -31,8 +31,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -48,15 +46,29 @@ Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, : ARMBaseRegisterInfo(tii, sti) { } +const TargetRegisterClass* +Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) + const { + if (RC == ARM::tGPRRegisterClass || RC->hasSuperClass(ARM::tGPRRegisterClass)) + return ARM::tGPRRegisterClass; + return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC); +} + +const TargetRegisterClass * +Thumb1RegisterInfo::getPointerRegClass(unsigned Kind) const { + return ARM::tGPRRegisterClass; +} + /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. 
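/// A small worked example of the path below (illustrative, not from the
/// sources): materializing #0x12345 into r2 becomes an "ldr r2, <cpool#N>"
/// (ARM::tLDRpci) against a constant-pool entry created to hold 0x12345.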
-void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - DebugLoc dl, - unsigned DestReg, unsigned SubIdx, - int Val, - ARMCC::CondCodes Pred, - unsigned PredReg) const { +void +Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = ConstantInt::get( @@ -64,8 +76,9 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci)) - .addReg(DestReg, getDefRegState(true), SubIdx) - .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg); + .addReg(DestReg, getDefRegState(true), SubIdx) + .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg) + .setMIFlags(MIFlags); } @@ -76,11 +89,12 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, static void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, bool CanChangeCC, const TargetInstrInfo &TII, const ARMBaseRegisterInfo& MRI, - DebugLoc dl) { + unsigned MIFlags = MachineInstr::NoFlags) { MachineFunction &MF = *MBB.getParent(); bool isHigh = !isARMLowRegister(DestReg) || (BaseReg != 0 && !isARMLowRegister(BaseReg)); @@ -101,14 +115,15 @@ void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, if (NumBytes <= 255 && NumBytes >= 0) AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) - .addImm(NumBytes); + .addImm(NumBytes).setMIFlags(MIFlags); else if (NumBytes < 0 && NumBytes >= -255) { AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)) - .addImm(NumBytes); + .addImm(NumBytes).setMIFlags(MIFlags); AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg)) - .addReg(LdReg, RegState::Kill); + .addReg(LdReg, RegState::Kill).setMIFlags(MIFlags); } else - MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes); + MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes, + ARMCC::AL, 0, MIFlags); // Emit add / sub. int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr); @@ -151,10 +166,11 @@ static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes, /// a destreg = basereg + immediate in Thumb code. void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, const TargetInstrInfo &TII, const ARMBaseRegisterInfo& MRI, - DebugLoc dl) { + unsigned MIFlags) { bool isSub = NumBytes < 0; unsigned Bytes = (unsigned)NumBytes; if (isSub) Bytes = -NumBytes; @@ -211,8 +227,9 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, if (NumMIs > Threshold) { // This will expand into too many instructions. Load the immediate from a // constpool entry. - emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII, - MRI, dl); + emitThumbRegPlusImmInReg(MBB, MBBI, dl, + DestReg, BaseReg, NumBytes, true, + TII, MRI, MIFlags); return; } @@ -224,11 +241,12 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, Bytes -= ThisVal; const TargetInstrDesc &TID = TII.get(isSub ? 
ARM::tSUBi3 : ARM::tADDi3); const MachineInstrBuilder MIB = - AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)); + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg).setMIFlags(MIFlags)); AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal)); } else { BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg) - .addReg(BaseReg, RegState::Kill); + .addReg(BaseReg, RegState::Kill) + .setMIFlags(MIFlags); } BaseReg = DestReg; } @@ -243,9 +261,10 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg); if (NeedCC) MIB = AddDefaultT1CC(MIB); - MIB .addReg(DestReg).addImm(ThisVal); + MIB.addReg(DestReg).addImm(ThisVal); if (NeedPred) MIB = AddDefaultPred(MIB); + MIB.setMIFlags(MIFlags); } else { bool isKill = BaseReg != ARM::SP; @@ -255,8 +274,9 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal); if (NeedPred) MIB = AddDefaultPred(MIB); - BaseReg = DestReg; + MIB.setMIFlags(MIFlags); + BaseReg = DestReg; if (Opc == ARM::tADDrSPi) { // r4 = add sp, imm // r4 = add r4, imm @@ -274,7 +294,8 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, const TargetInstrDesc &TID = TII.get(ExtraOpc); AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)) .addReg(DestReg, RegState::Kill) - .addImm(((unsigned)NumBytes) & 3)); + .addImm(((unsigned)NumBytes) & 3) + .setMIFlags(MIFlags)); } } @@ -283,8 +304,8 @@ static void emitSPUpdate(MachineBasicBlock &MBB, const TargetInstrInfo &TII, DebugLoc dl, const Thumb1RegisterInfo &MRI, int NumBytes) { - emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII, - MRI, dl); + emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, + MRI); } void Thumb1RegisterInfo:: @@ -337,7 +358,7 @@ static void emitThumbConstant(MachineBasicBlock &MBB, DestReg)) .addImm(ThisVal)); if (Imm > 0) - emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl); + emitThumbRegPlusImmediate(MBB, MBBI, dl, DestReg, DestReg, Imm, TII, MRI); if (isSub) { const TargetInstrDesc &TID = TII.get(ARM::tRSB); AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg)) @@ -430,8 +451,8 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, // MI would expand into a large number of instructions. Don't try to // simplify the immediate. 
if (NumMIs > 2) {
-    emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII,
-                              *this, dl);
+    emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
+                              *this);
    MBB.erase(II);
    return true;
  }
@@ -450,8 +471,8 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
      }
      Offset = (Offset - Mask * Scale);
      MachineBasicBlock::iterator NII = llvm::next(II);
-      emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII,
-                                *this, dl);
+      emitThumbRegPlusImmediate(MBB, NII, dl, DestReg, DestReg, Offset, TII,
+                                *this);
    } else {
      // Translate r0 = add sp, -imm to
      // r0 = -imm (this is then translated into a series of instructions)
@@ -645,15 +666,15 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
    bool UseRR = false;
    if (Opcode == ARM::tRestore) {
      if (FrameReg == ARM::SP)
-        emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
-                                 Offset, false, TII, *this, dl);
+        emitThumbRegPlusImmInReg(MBB, II, dl, TmpReg, FrameReg,
+                                 Offset, false, TII, *this);
      else {
        emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset);
        UseRR = true;
      }
    } else {
-      emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
-                                *this, dl);
+      emitThumbRegPlusImmediate(MBB, II, dl, TmpReg, FrameReg, Offset, TII,
+                                *this);
    }
    MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi));
@@ -668,15 +689,15 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
    if (Opcode == ARM::tSpill) {
      if (FrameReg == ARM::SP)
-        emitThumbRegPlusImmInReg(MBB, II, VReg, FrameReg,
-                                 Offset, false, TII, *this, dl);
+        emitThumbRegPlusImmInReg(MBB, II, dl, VReg, FrameReg,
+                                 Offset, false, TII, *this);
      else {
        emitLoadConstPool(MBB, II, dl, VReg, 0, Offset);
        UseRR = true;
      }
    } else
-      emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII,
-                                *this, dl);
+      emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII,
+                                *this);
    MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi));
    MI.getOperand(i).ChangeToRegister(VReg, false, false, true);
    if (UseRR)
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index 8a87cc5..9060e59 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -28,6 +28,11 @@ struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
 public:
  Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);

+  const TargetRegisterClass*
+    getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
+  const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+
  /// emitLoadConstPool - Emits a load from constpool to materialize the
  /// specified immediate.
  void emitLoadConstPool(MachineBasicBlock &MBB,
@@ -35,7 +40,8 @@ public:
                         DebugLoc dl,
                         unsigned DestReg, unsigned SubIdx, int Val,
                         ARMCC::CondCodes Pred = ARMCC::AL,
-                         unsigned PredReg = 0) const;
+                         unsigned PredReg = 0,
+                         unsigned MIFlags = MachineInstr::NoFlags) const;

  /// Code Generation virtual methods...
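The practical payoff of threading MIFlags through all of these emitters is that a caller can stamp one flag onto every instruction a helper expands to. A minimal caller sketch, in hypothetical Thumb1 prologue code (FrameSize, RegInfo, and the use of MachineInstr::FrameSetup here are illustrative assumptions, not lines from this patch):

    // Each tMOVi8/tRSB/tADD/tSUB the helper emits inherits the flag, so
    // later consumers (frame lowering, debug info) can still recognize
    // the whole expanded sequence as prologue code.
    emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP,
                              -(int)FrameSize, TII, *RegInfo,
                              MachineInstr::FrameSetup);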
void eliminateCallFramePseudoInstr(MachineFunction &MF, diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 9b1073b..d169dbb 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -184,7 +184,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg, - const ARMBaseInstrInfo &TII) { + const ARMBaseInstrInfo &TII, unsigned MIFlags) { bool isSub = NumBytes < 0; if (isSub) NumBytes = -NumBytes; @@ -198,14 +198,14 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, // Use a movw to materialize the 16-bit constant. BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg) .addImm(NumBytes) - .addImm((unsigned)Pred).addReg(PredReg); + .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags); Fits = true; } else if ((NumBytes & 0xffff) == 0) { // Use a movt to materialize the 32-bit constant. BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg) .addReg(DestReg) .addImm(NumBytes >> 16) - .addImm((unsigned)Pred).addReg(PredReg); + .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags); Fits = true; } @@ -214,12 +214,14 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg) .addReg(BaseReg, RegState::Kill) .addReg(DestReg, RegState::Kill) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); } else { BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg) .addReg(DestReg, RegState::Kill) .addReg(BaseReg, RegState::Kill) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg).addReg(0) + .setMIFlags(MIFlags); } return; } @@ -230,7 +232,8 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, unsigned Opc = 0; if (DestReg == ARM::SP && BaseReg != ARM::SP) { // mov sp, rn. Note t2MOVr cannot be used. - BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg).addReg(BaseReg); + BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg) + .addReg(BaseReg).setMIFlags(MIFlags); BaseReg = ARM::SP; continue; } @@ -243,7 +246,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, Opc = isSub ? ARM::tSUBspi : ARM::tADDspi; // FIXME: Fix Thumb1 immediate encoding. 
BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) - .addReg(BaseReg).addImm(ThisVal/4); + .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags); NumBytes = 0; continue; } @@ -283,7 +286,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) .addReg(BaseReg, RegState::Kill) - .addImm(ThisVal)); + .addImm(ThisVal)).setMIFlags(MIFlags); if (HasCCOut) AddDefaultCC(MIB); diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp index 099b8f7..355c3bf 100644 --- a/lib/Target/ARM/Thumb2RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp @@ -13,26 +13,15 @@ //===----------------------------------------------------------------------===// #include "ARM.h" -#include "ARMAddressingModes.h" -#include "ARMBaseInstrInfo.h" -#include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" #include "Thumb2InstrInfo.h" #include "Thumb2RegisterInfo.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" -#include "llvm/LLVMContext.h" #include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLocation.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/ErrorHandling.h" using namespace llvm; Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, @@ -42,13 +31,14 @@ Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. 
-void Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - DebugLoc dl, - unsigned DestReg, unsigned SubIdx, - int Val, - ARMCC::CondCodes Pred, - unsigned PredReg) const { +void +Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + DebugLoc dl, + unsigned DestReg, unsigned SubIdx, + int Val, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = ConstantInt::get( @@ -57,5 +47,6 @@ void Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci)) .addReg(DestReg, getDefRegState(true), SubIdx) - .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0); + .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0) + .setMIFlags(MIFlags); } diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h index b3cf2e5..824378a 100644 --- a/lib/Target/ARM/Thumb2RegisterInfo.h +++ b/lib/Target/ARM/Thumb2RegisterInfo.h @@ -35,7 +35,8 @@ public: DebugLoc dl, unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred = ARMCC::AL, - unsigned PredReg = 0) const; + unsigned PredReg = 0, + unsigned MIFlags = MachineInstr::NoFlags) const; }; } diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index cc8f61c..ce2e966 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -12,6 +12,7 @@ #include "ARMAddressingModes.h" #include "ARMBaseRegisterInfo.h" #include "ARMBaseInstrInfo.h" +#include "ARMSubtarget.h" #include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -49,82 +50,86 @@ namespace { // 1 - No cc field. // 2 - Always set CPSR. unsigned PredCC2 : 2; + unsigned PartFlag : 1; // 16-bit instruction does partial flag update unsigned Special : 1; // Needs to be dealt with specially }; static const ReduceEntry ReduceTable[] = { - // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, S - { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0 }, - { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0 }, + // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C, PF, S + { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0,0 }, + { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0,0 }, + { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0,0 }, // Note: immediate scale is 4. 
- { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 1 }, - { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 1 }, - { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 1 }, - { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 0 }, - { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 0 }, + { ARM::t2ADDrSPi,ARM::tADDrSPi,0, 8, 0, 1, 0, 1,0, 0,1 }, + { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 0,1 }, + { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 0,1 }, + { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0 }, //FIXME: Disable CMN, as CCodes are backwards from compare expectations - //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0 }, - { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0 }, - { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 1 }, - { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 0 }, + //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1 }, + { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0 }, // FIXME: adr.n immediate offset must be multiple of 4. - //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 0 }, - { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 }, - { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 }, - { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1 }, + //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0 }, + { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0 }, + // FIXME: tMOVi8 and tMVN also partially update CPSR but they are less + // likely to cause issue in the loop. As a size / performance workaround, + // they are not marked as such. + { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0,1 }, // FIXME: Do we need the 16-bit 'S' variant? 
- { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 }, - { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 }, - { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 1, 0,1, 0 }, - { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0 }, - { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 1 }, - { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0 }, - { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0 }, - { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0 }, - { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0 }, - { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0 }, - { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0 }, - { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0 }, - { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0 }, + { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0,0 }, + { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0,0 }, + { ARM::t2MOVCCi,0, ARM::tMOVCCi, 0, 8, 0, 1, 0,1, 0,0 }, + { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 1,0 }, + { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 0,1 }, + { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0,0 }, + { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0,0 }, + { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0 }, + { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0 }, + { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0 }, + { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,0 }, + { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,0 }, // FIXME: Clean this up after splitting each Thumb load / store opcode // into multiple ones. 
- { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 1 }, - { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 1 }, - { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 1 }, - { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 1 }, - - { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 1 }, + { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1 }, + { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1 }, + { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1 }, + + { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1 }, // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent - { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 1 }, - { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 1 }, + { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1 }, + { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1 }, }; class Thumb2SizeReduce : public MachineFunctionPass { @@ -133,6 +138,7 @@ namespace { Thumb2SizeReduce(); const Thumb2InstrInfo *TII; + const ARMSubtarget *STI; virtual bool runOnMachineFunction(MachineFunction &MF); @@ -144,6 +150,8 @@ namespace { /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable. DenseMap<unsigned, unsigned> ReduceOpcodeMap; + bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use); + bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, bool is2Addr, ARMCC::CondCodes Pred, bool LiveCPSR, bool &HasCC, bool &CCDead); @@ -152,19 +160,20 @@ namespace { const ReduceEntry &Entry); bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, - const ReduceEntry &Entry, bool LiveCPSR); + const ReduceEntry &Entry, bool LiveCPSR, + MachineInstr *CPSRDef); /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address /// instruction. bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR); + bool LiveCPSR, MachineInstr *CPSRDef); /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit /// non-two-address instruction. 
bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                        const ReduceEntry &Entry,
-                        bool LiveCPSR);
+                        bool LiveCPSR, MachineInstr *CPSRDef);

    /// ReduceMBB - Reduce width of instructions in the specified basic block.
    bool ReduceMBB(MachineBasicBlock &MBB);
@@ -187,6 +196,52 @@ static bool HasImplicitCPSRDef(const TargetInstrDesc &TID) {
  return false;
}

+/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
+/// 16-bit instructions with the 's' bit set partially update CPSR. Abort the
+/// transformation to avoid adding a false dependency on the last CPSR-setting
+/// instruction, which hurts the out-of-order engine's ability to do register
+/// renaming.
+/// This function checks if there is a read-after-write dependency between the
+/// last instruction that defines the CPSR and the current instruction. If
+/// there is, then there is no harm done, since the instruction cannot be
+/// retired before the CPSR-setting instruction anyway.
+/// Note: we do not do full dependency analysis here, for the sake of compile
+/// time. We're not looking for cases like:
+///   r0 = muls ...
+///   r1 = add.w r0, ...
+///   ...
+///      = mul.w r1
+/// In this case it would have been ok to narrow the mul.w to muls since there
+/// is an indirect RAW dependency between the muls and the mul.w.
+bool
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use) {
+  if (!Def || !STI->avoidCPSRPartialUpdate())
+    return false;
+
+  SmallSet<unsigned, 2> Defs;
+  for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Def->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    Defs.insert(Reg);
+  }
+
+  for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Use->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Defs.count(Reg))
+      return false;
+  }
+
+  // No read-after-write dependency. The narrowing will add a false dependency.
+  return true;
+}
+
bool
Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
                                  bool is2Addr, ARMCC::CondCodes Pred,
@@ -410,7 +465,10 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
-  (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  // Transfer MI flags.
+ MIB.setMIFlags(MI->getFlags()); DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); @@ -422,7 +480,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR) { + bool LiveCPSR, MachineInstr *CPSRDef) { if (Entry.LowRegs1 && !VerifyLowRegs(MI)) return false; @@ -440,12 +498,12 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, switch (Opc) { default: break; case ARM::t2ADDSri: { - if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) + if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) return true; // fallthrough } case ARM::t2ADDSrr: - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); } } break; @@ -453,13 +511,13 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, case ARM::t2RSBri: case ARM::t2RSBSri: if (MI->getOperand(2).getImm() == 0) - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); break; case ARM::t2MOVi16: // Can convert only 'pure' immediate operands, not immediates obtained as // globals' addresses. if (MI->getOperand(1).isImm()) - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); break; case ARM::t2CMPrr: { // Try to reduce to the lo-reg only version first. Why there are two @@ -468,17 +526,17 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, // are prioritized, but the table assumes a unique entry for each // source insn opcode. So for now, we hack a local entry record to use. static const ReduceEntry NarrowEntry = - { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 1 }; - if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR)) + { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1 }; + if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef)) return true; - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); } case ARM::t2ADDrSPi: { static const ReduceEntry NarrowEntry = - { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 1 }; + { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 0,1 }; if (MI->getOperand(0).getReg() == ARM::SP) - return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR); - return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef); + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef); } } return false; @@ -487,7 +545,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR) { + bool LiveCPSR, MachineInstr *CPSRDef) { if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) return false; @@ -542,6 +600,12 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead)) return false; + // Avoid adding a false dependency on partial flag update by some 16-bit + // instructions which has the 's' bit set. + if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(CPSRDef, MI)) + return false; + // Add the 16-bit instruction. 
DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID); @@ -563,6 +627,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, MIB.addOperand(MI->getOperand(i)); } + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); MBB.erase(MI); @@ -573,7 +640,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, bool Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, const ReduceEntry &Entry, - bool LiveCPSR) { + bool LiveCPSR, MachineInstr *CPSRDef) { if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) return false; @@ -626,6 +693,12 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead)) return false; + // Avoid adding a false dependency on partial flag update by some 16-bit + // instructions which has the 's' bit set. + if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC && + canAddPseudoFlagDep(CPSRDef, MI)) + return false; + // Add the 16-bit instruction. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID); @@ -663,6 +736,9 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, if (!TID.isPredicable() && NewTID.isPredicable()) AddDefaultPred(MIB); + // Transfer MI flags. + MIB.setMIFlags(MI->getFlags()); + DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); MBB.erase(MI); @@ -670,7 +746,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, return true; } -static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) { +static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) { bool HasDef = false; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); @@ -678,6 +754,8 @@ static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) { continue; if (MO.getReg() != ARM::CPSR) continue; + + DefCPSR = true; if (!MO.isDead()) HasDef = true; } @@ -707,6 +785,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { // Yes, CPSR could be livein. bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); + MachineInstr *CPSRDef = 0; MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end(); MachineBasicBlock::iterator NextMII; @@ -722,7 +801,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { const ReduceEntry &Entry = ReduceTable[OPI->second]; // Ignore "special" cases for now. if (Entry.Special) { - if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) { + if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); MI = &*I; @@ -731,7 +810,8 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { } // Try to transform to a 16-bit two-address instruction. - if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) { + if (Entry.NarrowOpc2 && + ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); MI = &*I; @@ -739,7 +819,8 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { } // Try to transform to a 16-bit non-two-address instruction. 
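For intuition, the hazard that the new PartFlag/CPSRDef plumbing guards against can be sketched as follows (an illustrative instruction sequence, not taken from this patch's tests):

    // A narrowed 16-bit flag-setting instruction updates CPSR only
    // partially, so an out-of-order core such as Cortex-A9 must treat it
    // as a read-modify-write of CPSR:
    //
    //   adds r0, r1, #1    ; writes CPSR
    //   ...                ; nothing here reads r0 or CPSR
    //   lsls r2, r3, #2    ; partial CPSR update -> false dep on the adds
    //
    // If the candidate directly reads one of the CPSR writer's results, it
    // cannot retire earlier anyway, so narrowing costs nothing.
    // canAddPseudoFlagDep(CPSRDef, MI) returns true only when no such read
    // exists, and the reduction is then skipped.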
- if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) { + if (Entry.NarrowOpc1 && + ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); MI = &*I; @@ -747,7 +828,14 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { } ProcessNext: - LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR); + bool DefCPSR = false; + LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR); + if (MI->getDesc().isCall()) + // Calls don't really set CPSR. + CPSRDef = 0; + else if (DefCPSR) + // This is the last CPSR defining instruction. + CPSRDef = MI; } return Modified; @@ -756,6 +844,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { const TargetMachine &TM = MF.getTarget(); TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo()); + STI = &TM.getSubtarget<ARMSubtarget>(); bool Modified = false; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) diff --git a/lib/Target/Alpha/Alpha.td b/lib/Target/Alpha/Alpha.td index 4508eda..ae79c2e 100644 --- a/lib/Target/Alpha/Alpha.td +++ b/lib/Target/Alpha/Alpha.td @@ -21,7 +21,7 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// def FeatureCIX : SubtargetFeature<"cix", "HasCT", "true", - "Enable CIX extentions">; + "Enable CIX extensions">; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp index c4f43ab..ee404f0 100644 --- a/lib/Target/Alpha/AlphaISelLowering.cpp +++ b/lib/Target/Alpha/AlphaISelLowering.cpp @@ -296,7 +296,7 @@ AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain and // flag operands which copy the outgoing args into registers. The InFlag in - // necessary since all emited instructions must be stuck together. + // necessary since all emitted instructions must be stuck together. 
SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td
index 099d715..b201712 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.td
+++ b/lib/Target/Alpha/AlphaInstrInfo.td
@@ -1030,7 +1030,7 @@ def : Pat<(brcond (setune F8RC:$RA, immFPZ), bb:$DISP),
//WMB Mfc 18.4400 Write memory barrier
//MF_FPCR F-P 17.025 Move from FPCR
//MT_FPCR F-P 17.024 Move to FPCR
-//There are in the Multimedia extentions, so let's not use them yet
+//These are in the Multimedia extensions, so let's not use them yet
//def MAXSB8 : OForm<0x1C, 0x3E, "MAXSB8 $RA,$RB,$RC">; //Vector signed byte maximum
//def MAXSW4 : OForm< 0x1C, 0x3F, "MAXSW4 $RA,$RB,$RC">; //Vector signed word maximum
//def MAXUB8 : OForm<0x1C, 0x3C, "MAXUB8 $RA,$RB,$RC">; //Vector unsigned byte maximum
diff --git a/lib/Target/Alpha/README.txt b/lib/Target/Alpha/README.txt
index 9ae1517..cc170e3 100644
--- a/lib/Target/Alpha/README.txt
+++ b/lib/Target/Alpha/README.txt
@@ -33,9 +33,9 @@ add crazy vector instructions (MVI):
(MIN|MAX)(U|S)(B8|W4) min and max, signed and unsigned, byte and word
PKWB, UNPKBW pack/unpack word to byte
PKLB UNPKBL pack/unpack long to byte
-PERR pixel error (sum accross bytes of bytewise abs(i8v8 a - i8v8 b))
+PERR pixel error (sum across bytes of bytewise abs(i8v8 a - i8v8 b))

-cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extentions)
+cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extensions)

this has some good examples for other operations that can be synthesised well
from these rather meager vector ops (such as saturating add).
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp
index 7c80eec..1e1f8c9 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.cpp
+++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp
@@ -345,7 +345,7 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee,

  // Build a sequence of copy-to-reg nodes chained together with token
  // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag is necessary since all emitted instructions must be
  // stuck together.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
index 6c555a3..358d1b3 100644
--- a/lib/Target/CBackend/CBackend.cpp
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -2440,24 +2440,6 @@ void CWriter::visitReturnInst(ReturnInst &I) {
    return;
  }

-  if (I.getNumOperands() > 1) {
-    Out << " {\n";
-    Out << " ";
-    printType(Out, I.getParent()->getParent()->getReturnType());
-    Out << " llvm_cbe_mrv_temp = {\n";
-    for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
-      Out << " ";
-      writeOperand(I.getOperand(i));
-      if (i != e - 1)
-        Out << ",";
-      Out << "\n";
-    }
-    Out << " };\n";
-    Out << " return llvm_cbe_mrv_temp;\n";
-    Out << " }\n";
-    return;
-  }
-
  Out << " return";
  if (I.getNumOperands()) {
    Out << ' ';
diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td
index 5ef5716..f340edf 100644
--- a/lib/Target/CellSPU/SPU64InstrInfo.td
+++ b/lib/Target/CellSPU/SPU64InstrInfo.td
@@ -24,7 +24,7 @@
// 5. The code sequences for r64 and v2i64 are probably overly conservative,
//    compared to the code that gcc produces.
//
-// M00$E B!tes Kan be Pretty N@sTi!!!!! (appologies to Monty!)
+// M00$E B!tes Kan be Pretty N@sTi!!!!! (apologies to Monty!) //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // selb instruction definition for i64. Note that the selection mask is diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp index 4040461..fd96694 100644 --- a/lib/Target/CellSPU/SPUAsmPrinter.cpp +++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp @@ -182,6 +182,10 @@ namespace { printOp(MI->getOperand(OpNo), O); } + void printHBROperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + printOp(MI->getOperand(OpNo), O); + } + void printPCRelativeOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { // Used to generate a ".-<target>", but it turns out that the assembler // really wants the target. @@ -279,6 +283,9 @@ void SPUAsmPrinter::printOp(const MachineOperand &MO, raw_ostream &O) { } O << *Mang->getSymbol(MO.getGlobal()); return; + case MachineOperand::MO_MCSymbol: + O << *(MO.getMCSymbol()); + return; default: O << "<unknown operand type: " << MO.getType() << ">"; return; diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index d226156..9351ffd 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -321,12 +321,17 @@ SPUDAGToDAGISel::SelectAFormAddr(SDNode *Op, SDValue N, SDValue &Base, // These match the addr256k operand type: EVT OffsVT = MVT::i16; SDValue Zero = CurDAG->getTargetConstant(0, OffsVT); + int64_t val; switch (N.getOpcode()) { case ISD::Constant: + val = dyn_cast<ConstantSDNode>(N.getNode())->getSExtValue(); + Base = CurDAG->getTargetConstant( val , MVT::i32); + Index = Zero; + return true; break; case ISD::ConstantPool: case ISD::GlobalAddress: - report_fatal_error("SPU SelectAFormAddr: Constant/Pool/Global not lowered."); + report_fatal_error("SPU SelectAFormAddr: Pool/Global not lowered."); /*NOTREACHED*/ case ISD::TargetConstant: diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 743a4d7..8668da3 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -705,7 +705,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { offset )); - // Shift the low similarily + // Shift the low similarly // TODO: add SPUISD::SHL_BYTES low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset ); diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index dd48d7b..cf883e2 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -183,14 +183,6 @@ namespace llvm { virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty) const; - - /// After allocating this many registers, the allocator should feel - /// register pressure. The value is a somewhat random guess, based on the - /// number of non callee saved registers in the C calling convention. 
-  virtual unsigned getRegPressureLimit( const TargetRegisterClass *RC,
-                                        MachineFunction &MF) const{
-    return 50;
-  }
};

}
diff --git a/lib/Target/CellSPU/SPUInstrFormats.td b/lib/Target/CellSPU/SPUInstrFormats.td
index 21bc275..bdbe255 100644
--- a/lib/Target/CellSPU/SPUInstrFormats.td
+++ b/lib/Target/CellSPU/SPUInstrFormats.td
@@ -296,3 +296,25 @@ class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
  let Pattern = pattern;
  let Inst{31-0} = 0;
}
+
+//===----------------------------------------------------------------------===//
+// Branch hint formats
+//===----------------------------------------------------------------------===//
+// For hbrr and hbra
+class HBI16Form<bits<7> opcode, dag IOL, string asmstr>
+  : Instruction {
+  field bits<32> Inst;
+  bits<16> i16;
+  bits<9> RO;
+
+  let Namespace = "SPU";
+  let InOperandList = IOL;
+  let OutOperandList = (outs); // no output
+  let AsmString = asmstr;
+  let Itinerary = BranchHints;
+
+  let Inst{0-6} = opcode;
+  let Inst{7-8} = RO{8-7};
+  let Inst{9-24} = i16;
+  let Inst{25-31} = RO{6-0};
+}
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
index f9e6c72..080434d 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCContext.h"

using namespace llvm;

@@ -281,9 +282,20 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
  return true;
}

+// Search MBB for branch hint labels and branch hint ops.
+static void removeHBR(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+    if (I->getOpcode() == SPU::HBRA ||
+        I->getOpcode() == SPU::HBR_LABEL)
+      I = MBB.erase(I); // erase() already advances to the next instruction
+    else
+      ++I;
+  }
+}
+
unsigned
SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator I = MBB.end();
+  removeHBR(MBB);
  if (I == MBB.begin())
    return 0;
  --I;
@@ -314,6 +326,23 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
    return 2;
  }

+/** Find the optimal position for a hint branch instruction in a basic block.
+ * This should take into account:
+ *   - the branch hint delays
+ *   - congestion of the memory bus
+ *   - dual-issue scheduling (i.e. avoid insertion of nops)
+ * The current implementation is rather simplistic.
+ */
+static MachineBasicBlock::iterator findHBRPosition(MachineBasicBlock &MBB)
+{
+  MachineBasicBlock::iterator J = MBB.end();
+  for (int i = 0; i < 8; i++) {
+    if (J == MBB.begin())
+      return J;
+    J--;
+  }
+  return J;
+}
+
unsigned
SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                           MachineBasicBlock *FBB,
@@ -324,32 +353,61 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
  assert((Cond.size() == 2 || Cond.size() == 0) &&
         "SPU branch conditions have two components!");

+  MachineInstrBuilder MIB;
+  // TODO: make a more accurate algorithm.
+  bool haveHBR = MBB.size() > 8;
+
+  removeHBR(MBB);
+  MCSymbol *branchLabel = MBB.getParent()->getContext().CreateTempSymbol();
+  // Add a label just before the branch
+  if (haveHBR)
+    MIB = BuildMI(&MBB, DL, get(SPU::HBR_LABEL)).addSym(branchLabel);
+
  // One-way branch.
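For reference, the instruction layout that InsertBranch is building toward looks like this (label names are illustrative, not emitted verbatim):

    //     ...
    //     hbra  .L_br, .L_target  ; hint placed up to 8 instructions early
    //     ...                     ; by findHBRPosition(), giving the SPU
    //   .L_br:                    ; time to prefetch the branch target
    //     br    .L_target         ; the hinted branch itself
    //
    // HBR_LABEL pins the .L_br symbol immediately in front of the branch,
    // so the hbra's first operand can name the exact address of the branch
    // being hinted.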
if (FBB == 0) {
    if (Cond.empty()) {
      // Unconditional branch
-      MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(SPU::BR));
+      MIB = BuildMI(&MBB, DL, get(SPU::BR));
      MIB.addMBB(TBB);

      DEBUG(errs() << "Inserted one-way uncond branch: ");
      DEBUG((*MIB).dump());
+
+      // Basic blocks have just one branch, so it is safe to add the hint to it.
+      if (haveHBR) {
+        MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA));
+        MIB.addSym(branchLabel);
+        MIB.addMBB(TBB);
+      }
    } else {
      // Conditional branch
-      MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
+      MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
      MIB.addReg(Cond[1].getReg()).addMBB(TBB);
+
+      if (haveHBR) {
+        MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA));
+        MIB.addSym(branchLabel);
+        MIB.addMBB(TBB);
+      }
+
      DEBUG(errs() << "Inserted one-way cond branch: ");
      DEBUG((*MIB).dump());
    }
    return 1;
  } else {
-    MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
+    MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
    MachineInstrBuilder MIB2 = BuildMI(&MBB, DL, get(SPU::BR));

    // Two-way Conditional Branch.
    MIB.addReg(Cond[1].getReg()).addMBB(TBB);
    MIB2.addMBB(FBB);
+
+    if (haveHBR) {
+      MIB = BuildMI(MBB, findHBRPosition(MBB), DL, get(SPU::HBRA));
+      MIB.addSym(branchLabel);
+      MIB.addMBB(FBB);
+    }
+
    DEBUG(errs() << "Inserted conditional branch: ");
    DEBUG((*MIB).dump());
    DEBUG(errs() << "part 2: ");
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
index 25f6fd0..e103c9b 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -28,6 +28,8 @@ let hasCtrlDep = 1, Defs = [R1], Uses = [R1] in {
  def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm_i32:$amt),
                              "${:comment} ADJCALLSTACKUP",
                              [(callseq_end timm:$amt)]>;
+  def HBR_LABEL : Pseudo<(outs), (ins hbrtarget:$targ),
+                         "$targ:\t${:comment}branch hint target",[ ]>;
}

//===----------------------------------------------------------------------===//
@@ -2013,9 +2015,9 @@ class SHLHInst<dag OOL, dag IOL, list<dag> pattern>:
         RotShiftVec, pattern>;

class SHLHVecInst<ValueType vectype>:
-    SHLHInst<(outs VECREG:$rT), (ins VECREG:$rA, R16C:$rB),
+    SHLHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
             [(set (vectype VECREG:$rT),
-                  (SPUvec_shl (vectype VECREG:$rA), R16C:$rB))]>;
+                  (SPUvec_shl (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;

multiclass ShiftLeftHalfword
{
@@ -2063,9 +2065,9 @@ class SHLInst<dag OOL, dag IOL, list<dag> pattern>:
multiclass ShiftLeftWord
{
  def v4i32:
-      SHLInst<(outs VECREG:$rT), (ins VECREG:$rA, R16C:$rB),
+      SHLInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
             [(set (v4i32 VECREG:$rT),
-                  (SPUvec_shl (v4i32 VECREG:$rA), R16C:$rB))]>;
+                  (SPUvec_shl (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
  def r32:
      SHLInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
              [(set R32C:$rT, (shl R32C:$rA, R32C:$rB))]>;
@@ -2511,19 +2513,11 @@ class ROTHMInst<dag OOL, dag IOL, list<dag> pattern>:
         RotShiftVec, pattern>;

def ROTHMv8i16:
-    ROTHMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+    ROTHMInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
              [/* see patterns below - $rB must be negated */]>;

-def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R32C:$rB),
-          (ROTHMv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
-
-def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R16C:$rB),
-          (ROTHMv8i16 VECREG:$rA,
-                      (SFIr32 (XSHWr16 R16C:$rB), 0))>;
-
-def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R8C:$rB),
-          (ROTHMv8i16 VECREG:$rA,
-                      (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB) ), 0))>;
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)),
+
(ROTHMv8i16 VECREG:$rA, (SFHIvec VECREG:$rB, 0))>; // ROTHM r16 form: Rotate 16-bit quantity to right, zero fill at the left // Note: This instruction doesn't match a pattern because rB must be negated @@ -2584,19 +2578,11 @@ class ROTMInst<dag OOL, dag IOL, list<dag> pattern>: RotShiftVec, pattern>; def ROTMv4i32: - ROTMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + ROTMInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), [/* see patterns below - $rB must be negated */]>; -def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), R32C:$rB), - (ROTMv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; - -def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), R16C:$rB), - (ROTMv4i32 VECREG:$rA, - (SFIr32 (XSHWr16 R16C:$rB), 0))>; - -def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), R8C:$rB), - (ROTMv4i32 VECREG:$rA, - (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>; +def : Pat<(SPUvec_srl (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)), + (ROTMv4i32 VECREG:$rA, (SFIvec VECREG:$rB, 0))>; def ROTMr32: ROTMInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), @@ -2802,20 +2788,12 @@ defm ROTQMBII: RotateMaskQuadByBitsImm; //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ def ROTMAHv8i16: - RRForm<0b01111010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + RRForm<0b01111010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), "rotmah\t$rT, $rA, $rB", RotShiftVec, [/* see patterns below - $rB must be negated */]>; -def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), R32C:$rB), - (ROTMAHv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; - -def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), R16C:$rB), - (ROTMAHv8i16 VECREG:$rA, - (SFIr32 (XSHWr16 R16C:$rB), 0))>; - -def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), R8C:$rB), - (ROTMAHv8i16 VECREG:$rA, - (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>; +def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)), + (ROTMAHv8i16 VECREG:$rA, (SFHIvec VECREG:$rB, 0))>; def ROTMAHr16: RRForm<0b01111010000, (outs R16C:$rT), (ins R16C:$rA, R32C:$rB), @@ -2857,20 +2835,12 @@ def : Pat<(sra R16C:$rA, (i8 imm:$val)), (ROTMAHIr16 R16C:$rA, (TO_IMM32 uimm7:$val))>; def ROTMAv4i32: - RRForm<0b01011010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + RRForm<0b01011010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), "rotma\t$rT, $rA, $rB", RotShiftVec, [/* see patterns below - $rB must be negated */]>; -def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), R32C:$rB), - (ROTMAv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; - -def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), R16C:$rB), - (ROTMAv4i32 VECREG:$rA, - (SFIr32 (XSHWr16 R16C:$rB), 0))>; - -def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), R8C:$rB), - (ROTMAv4i32 VECREG:$rA, - (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>; +def : Pat<(SPUvec_sra (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)), + (ROTMAv4i32 VECREG:$rA, (SFIvec (v4i32 VECREG:$rB), 0))>; def ROTMAr32: RRForm<0b01011010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB), @@ -4208,8 +4178,8 @@ def : Pat<(fabs (v4f32 VECREG:$rA)), //===----------------------------------------------------------------------===// // Hint for branch instructions: //===----------------------------------------------------------------------===// - -/* def HBR : SPUInstr<(outs), (ins), "hbr\t" */ +def HBRA : + HBI16Form<0b0001001,(ins hbrtarget:$brinst, brtarget:$btarg), "hbra\t$brinst, $btarg">; //===----------------------------------------------------------------------===// // Execution, Load NOP (execute NOPs belong in even pipeline, load NOPs belong diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index 641da04..1708c59 100644 --- 
a/lib/Target/CellSPU/SPURegisterInfo.h
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -46,6 +46,14 @@ namespace llvm {
    virtual const TargetRegisterClass *
    getPointerRegClass(unsigned Kind = 0) const;

+    /// After allocating this many registers, the allocator should feel
+    /// register pressure. The value is a somewhat random guess, based on the
+    /// number of non callee saved registers in the C calling convention.
+    virtual unsigned getRegPressureLimit( const TargetRegisterClass *RC,
+                                          MachineFunction &MF) const{
+      return 50;
+    }
+
    //! Return the array of callee-saved registers
    virtual const unsigned* getCalleeSavedRegs(const MachineFunction *MF) const;

diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 71d6049..797cfd5 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -1348,12 +1348,10 @@ void CppWriter::printInstruction(const Instruction *I,
    const PHINode* phi = cast<PHINode>(I);

    Out << "PHINode* " << iName << " = PHINode::Create("
-        << getCppName(phi->getType()) << ", \"";
+        << getCppName(phi->getType()) << ", "
+        << phi->getNumIncomingValues() << ", \"";
    printEscapedString(phi->getName());
    Out << "\", " << bbname << ");";
-    nl(Out) << iName << "->reserveOperandSpace("
-      << phi->getNumIncomingValues()
-        << ");";
    nl(Out);
    for (unsigned i = 0; i < phi->getNumOperands(); i+=2) {
      Out << iName << "->addIncoming("
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
index 3379ac2..060a87b 100644
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
+++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
@@ -57,18 +57,26 @@ static unsigned mblazeBinary2Opcode[] = {
};

static unsigned getRD(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isRegister((insn>>21)&0x1F))
+    return UNSUPPORTED;
  return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>21)&0x1F);
}

static unsigned getRA(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isRegister((insn>>16)&0x1F))
+    return UNSUPPORTED;
  return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>16)&0x1F);
}

static unsigned getRB(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isRegister((insn>>11)&0x1F))
+    return UNSUPPORTED;
  return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>11)&0x1F);
}

static int64_t getRS(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isSpecialRegister(insn&0x3FFF))
+    return UNSUPPORTED;
  return MBlazeRegisterInfo::getSpecialRegisterFromNumbering(insn&0x3FFF);
}

@@ -489,13 +497,14 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
                                        raw_ostream &vStream) const {
  // The machine instruction.
  uint32_t insn;
+  uint64_t read;
  uint8_t bytes[4];

-  // We always consume 4 bytes of data
-  size = 4;
+  // By default we consume 1 byte on failure
+  size = 1;

  // We want to read exactly 4 bytes of data.
-  if (region.readBytes(address, 4, (uint8_t*)bytes, NULL) == -1)
+  if (region.readBytes(address, 4, (uint8_t*)bytes, &read) == -1 || read < 4)
    return false;

  // Encoded as a big-endian 32-bit word in the stream.
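Reporting size = 1 on failure matters to byte-stream consumers that want to resynchronize rather than silently skip a whole word. A sketch of the driver loop this contract enables (Dis, Region, Start, End, handleDecoded, and the exact getInstruction parameter order are assumptions based on the signature above, not code from this patch):

    MCInst Inst;
    uint64_t Size = 0;
    for (uint64_t Addr = Start; Addr < End; Addr += Size) {
      if (Dis.getInstruction(Inst, Size, Region, Addr, nulls()))
        handleDecoded(Inst); // Size == 4: a full word was decoded
      // else Size == 1: advance a single byte and try to realign
    }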
@@ -509,44 +518,63 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, instr.setOpcode(opcode); + unsigned RD = getRD(insn); + unsigned RA = getRA(insn); + unsigned RB = getRB(insn); + unsigned RS = getRS(insn); + uint64_t tsFlags = MBlazeInsts[opcode].TSFlags; switch ((tsFlags & MBlazeII::FormMask)) { - default: llvm_unreachable("unknown instruction encoding"); + default: + return false; case MBlazeII::FRRRR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RB)); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlazeII::FRRR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FRI: switch (opcode) { - default: llvm_unreachable("unknown instruction encoding"); + default: + return false; case MBlaze::MFS: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(insn&0x3FFF)); break; case MBlaze::MTS: + if (RA == UNSUPPORTED) + return false; instr.addOperand(MCOperand::CreateImm(insn&0x3FFF)); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlaze::MSRSET: case MBlaze::MSRCLR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(insn&0x7FFF)); break; } break; case MBlazeII::FRRI: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); switch (opcode) { default: instr.addOperand(MCOperand::CreateImm(getIMM(insn))); @@ -560,27 +588,37 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, break; case MBlazeII::FCRR: - instr.addOperand(MCOperand::CreateReg(getRA(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RA == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RA)); + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FCRI: - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RA)); instr.addOperand(MCOperand::CreateImm(getIMM(insn))); break; case MBlazeII::FRCR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RD == UNSUPPORTED || RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FRCI: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(getIMM(insn))); break; case MBlazeII::FCCR: - 
instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FCCI: @@ -588,33 +626,45 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, break; case MBlazeII::FRRCI: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); instr.addOperand(MCOperand::CreateImm(getSHT(insn))); break; case MBlazeII::FRRC: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlazeII::FRCX: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(getFSL(insn))); break; case MBlazeII::FRCS: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); - instr.addOperand(MCOperand::CreateReg(getRS(insn))); + if (RD == UNSUPPORTED || RS == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); + instr.addOperand(MCOperand::CreateReg(RS)); break; case MBlazeII::FCRCS: - instr.addOperand(MCOperand::CreateReg(getRS(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RS == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RS)); + instr.addOperand(MCOperand::CreateReg(RA)); break; case MBlazeII::FCRCX: - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + if (RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RA)); instr.addOperand(MCOperand::CreateImm(getFSL(insn))); break; @@ -623,16 +673,23 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr, break; case MBlazeII::FCR: - instr.addOperand(MCOperand::CreateReg(getRB(insn))); + if (RB == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RB)); break; case MBlazeII::FRIR: - instr.addOperand(MCOperand::CreateReg(getRD(insn))); + if (RD == UNSUPPORTED || RA == UNSUPPORTED) + return false; + instr.addOperand(MCOperand::CreateReg(RD)); instr.addOperand(MCOperand::CreateImm(getIMM(insn))); - instr.addOperand(MCOperand::CreateReg(getRA(insn))); + instr.addOperand(MCOperand::CreateReg(RA)); break; } + // We always consume 4 bytes of data on success + size = 4; + return true; } diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h index bebc6c8..13c4b49 100644 --- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h +++ b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h @@ -18,11 +18,12 @@ namespace llvm { class MCOperand; + class TargetMachine; class MBlazeInstPrinter : public MCInstPrinter { public: - MBlazeInstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) { - } + MBlazeInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) + : MCInstPrinter(MAI) {} virtual void printInst(const MCInst *MI, raw_ostream &O); diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td index 1fa1e4d..1245658 100644 --- a/lib/Target/MBlaze/MBlaze.td +++ b/lib/Target/MBlaze/MBlaze.td @@ -31,49 +31,28 @@ def MBlazeInstrInfo : InstrInfo; // Microblaze Subtarget features // //===----------------------------------------------------------------------===// -def 
FeaturePipe3 : SubtargetFeature<"pipe3", "HasPipe3", "true", - "Implements 3-stage pipeline">; def FeatureBarrel : SubtargetFeature<"barrel", "HasBarrel", "true", "Implements barrel shifter">; def FeatureDiv : SubtargetFeature<"div", "HasDiv", "true", "Implements hardware divider">; def FeatureMul : SubtargetFeature<"mul", "HasMul", "true", "Implements hardware multiplier">; -def FeatureFSL : SubtargetFeature<"fsl", "HasFSL", "true", - "Implements FSL instructions">; -def FeatureEFSL : SubtargetFeature<"efsl", "HasEFSL", "true", - "Implements extended FSL instructions">; -def FeatureMSRSet : SubtargetFeature<"msrset", "HasMSRSet", "true", - "Implements MSR register set and clear">; -def FeatureException : SubtargetFeature<"exception", "HasException", "true", - "Implements hardware exception support">; def FeaturePatCmp : SubtargetFeature<"patcmp", "HasPatCmp", "true", "Implements pattern compare instruction">; def FeatureFPU : SubtargetFeature<"fpu", "HasFPU", "true", "Implements floating point unit">; -def FeatureESR : SubtargetFeature<"esr", "HasESR", "true", - "Implements ESR and EAR registers">; -def FeaturePVR : SubtargetFeature<"pvr", "HasPVR", "true", - "Implements processor version register">; def FeatureMul64 : SubtargetFeature<"mul64", "HasMul64", "true", "Implements multiplier with 64-bit result">; def FeatureSqrt : SubtargetFeature<"sqrt", "HasSqrt", "true", "Implements sqrt and floating point convert">; -def FeatureMMU : SubtargetFeature<"mmu", "HasMMU", "true", - "Implements memory management unit">; //===----------------------------------------------------------------------===// // MBlaze processors supported. //===----------------------------------------------------------------------===// -class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, MBlazeGenericItineraries, Features>; - -def : Proc<"v400", []>; -def : Proc<"v500", []>; -def : Proc<"v600", []>; -def : Proc<"v700", []>; -def : Proc<"v710", []>; +def : Processor<"mblaze", MBlazeGenericItineraries, []>; +def : Processor<"mblaze3", MBlazePipe3Itineraries, []>; +def : Processor<"mblaze5", MBlazePipe5Itineraries, []>; //===----------------------------------------------------------------------===// // Instruction Descriptions diff --git a/lib/Target/MBlaze/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MBlazeAsmBackend.cpp index a4b21af..08f14c3 100644 --- a/lib/Target/MBlaze/MBlazeAsmBackend.cpp +++ b/lib/Target/MBlaze/MBlazeAsmBackend.cpp @@ -150,14 +150,13 @@ void ELFMBlazeAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, TargetAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) assert(0 && "Mac not supported on MBlaze"); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + + if (TheTriple.isOSWindows()) assert(0 && "Windows not supported on MBlaze"); - default: - return new ELFMBlazeAsmBackend(T, Triple(TT).getOS()); - } + + return new ELFMBlazeAsmBackend(T, TheTriple.getOS()); } diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp index 0016df5..0f0f60e 100644 --- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp +++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp @@ -319,10 +319,11 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { } static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { if (SyntaxVariant == 
0) - return new MBlazeInstPrinter(MAI); + return new MBlazeInstPrinter(TM, MAI); return 0; } diff --git a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp index 4399ee2..973e968 100644 --- a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp +++ b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp @@ -77,7 +77,7 @@ static bool hasImmInstruction(MachineBasicBlock::iterator &candidate) { // We must assume that unknown immediate values require more than // 16-bits to represent. - if (mop.isGlobal() || mop.isSymbol()) + if (mop.isGlobal() || mop.isSymbol() || mop.isJTI() || mop.isCPI()) return true; // FIXME: we could probably check to see if the FP value happens diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp index f39826b..21a5988 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.cpp +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -274,7 +274,7 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI, F->insert(It, loop); F->insert(It, finish); - // Update machine-CFG edges by transfering adding all successors and + // Update machine-CFG edges by transferring all successors and // remaining instructions from the current block to the new block which // will contain the Phi node for the select. finish->splice(finish->begin(), MBB, @@ -456,7 +456,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI, F->insert(It, start); F->insert(It, exit); - // Update machine-CFG edges by transfering adding all successors and + // Update machine-CFG edges by transferring all successors and // remaining instructions from the current block to the new block which // will contain the Phi node for the select. exit->splice(exit->begin(), MBB, llvm::next(MachineBasicBlock::iterator(MI)), @@ -778,7 +778,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. - // The InFlag in necessary since all emited instructions must be + // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { @@ -1103,7 +1103,7 @@ MBlazeTargetLowering::getSingleConstraintMatchWeight( switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); - break;
+ break; case 'd': case 'y': if (type->isIntegerTy()) diff --git a/lib/Target/MBlaze/MBlazeInstrFPU.td b/lib/Target/MBlaze/MBlazeInstrFPU.td index 094de5c..4acdcfd 100644 --- a/lib/Target/MBlaze/MBlazeInstrFPU.td +++ b/lib/Target/MBlaze/MBlazeInstrFPU.td @@ -21,22 +21,22 @@ class LoadFM<bits<6> op, string instr_asm, PatFrag OpNode> : TA<op, 0x000, (outs GPR:$dst), (ins memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IILoad>; + [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IIC_MEMl>; class LoadFMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs GPR:$dst), (ins memri:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IILoad>; + [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>; class StoreFM<bits<6> op, string instr_asm, PatFrag OpNode> : TA<op, 0x000, (outs), (ins GPR:$dst, memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIStore>; + [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIC_MEMs>; class StoreFMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs), (ins GPR:$dst, memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIStore>; + [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIC_MEMs>; class ArithF<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, InstrItinClass itin> : @@ -56,15 +56,10 @@ class ArithFR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, !strconcat(instr_asm, " $dst, $c, $b"), [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>; -class LogicF<bits<6> op, string instr_asm> : - TB<op, (outs GPR:$dst), (ins GPR:$b, GPR:$c), - !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; - class LogicFI<bits<6> op, string instr_asm> : TB<op, (outs GPR:$dst), (ins GPR:$b, fimm:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; let rb=0 in { class ArithF2<bits<6> op, bits<11> flags, string instr_asm, @@ -95,10 +90,10 @@ let rb=0 in { //===----------------------------------------------------------------------===// let Predicates=[HasFPU] in { def FORI : LogicFI<0x28, "ori ">; - def FADD : ArithF<0x16, 0x000, "fadd ", fadd, IIAlu>; - def FRSUB : ArithFR<0x16, 0x080, "frsub ", fsub, IIAlu>; - def FMUL : ArithF<0x16, 0x100, "fmul ", fmul, IIAlu>; - def FDIV : ArithF<0x16, 0x180, "fdiv ", fdiv, IIAlu>; + def FADD : ArithF<0x16, 0x000, "fadd ", fadd, IIC_FPU>; + def FRSUB : ArithFR<0x16, 0x080, "frsub ", fsub, IIC_FPU>; + def FMUL : ArithF<0x16, 0x100, "fmul ", fmul, IIC_FPU>; + def FDIV : ArithF<0x16, 0x180, "fdiv ", fdiv, IIC_FPUd>; } let Predicates=[HasFPU], isCodeGenOnly=1 in { @@ -110,19 +105,19 @@ let Predicates=[HasFPU], isCodeGenOnly=1 in { } let Predicates=[HasFPU,HasSqrt] in { - def FLT : ArithIF<0x16, 0x280, "flt ", IIAlu>; - def FINT : ArithFI<0x16, 0x300, "fint ", IIAlu>; - def FSQRT : ArithF2<0x16, 0x380, "fsqrt ", IIAlu>; + def FLT : ArithIF<0x16, 0x280, "flt ", IIC_FPUf>; + def FINT : ArithFI<0x16, 0x300, "fint ", IIC_FPUi>; + def FSQRT : ArithF2<0x16, 0x380, "fsqrt ", IIC_FPUs>; } let isAsCheapAsAMove = 1 in { - def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", IIAlu>; - def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIAlu>; - def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIAlu>; - def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIAlu>; - def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIAlu>; - def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIAlu>; - def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIAlu>; + def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", 
IIC_FPUc>; + def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIC_FPUc>; + def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIC_FPUc>; + def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIC_FPUc>; + def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIC_FPUc>; + def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIC_FPUc>; + def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIC_FPUc>; } diff --git a/lib/Target/MBlaze/MBlazeInstrFSL.td b/lib/Target/MBlaze/MBlazeInstrFSL.td index 3209845..3082a7e 100644 --- a/lib/Target/MBlaze/MBlazeInstrFSL.td +++ b/lib/Target/MBlaze/MBlazeInstrFSL.td @@ -13,7 +13,7 @@ class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FRCX, (outs GPR:$dst), (ins fslimm:$b), !strconcat(instr_asm, " $dst, $b"), - [(set GPR:$dst, (OpNode immZExt4:$b))],IIAlu> + [(set GPR:$dst, (OpNode immZExt4:$b))],IIC_FSLg> { bits<5> rd; bits<4> fslno; @@ -29,7 +29,7 @@ class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FRCR, (outs GPR:$dst), (ins GPR:$b), !strconcat(instr_asm, " $dst, $b"), - [(set GPR:$dst, (OpNode GPR:$b))], IIAlu> + [(set GPR:$dst, (OpNode GPR:$b))], IIC_FSLg> { bits<5> rd; bits<5> rb; @@ -45,7 +45,7 @@ class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> : class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCRCX, (outs), (ins GPR:$v, fslimm:$b), !strconcat(instr_asm, " $v, $b"), - [(OpNode GPR:$v, immZExt4:$b)], IIAlu> + [(OpNode GPR:$v, immZExt4:$b)], IIC_FSLp> { bits<5> ra; bits<4> fslno; @@ -61,7 +61,7 @@ class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCRR, (outs), (ins GPR:$v, GPR:$b), !strconcat(instr_asm, " $v, $b"), - [(OpNode GPR:$v, GPR:$b)], IIAlu> + [(OpNode GPR:$v, GPR:$b)], IIC_FSLp> { bits<5> ra; bits<5> rb; @@ -77,7 +77,7 @@ class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCX, (outs), (ins fslimm:$b), !strconcat(instr_asm, " $b"), - [(OpNode immZExt4:$b)], IIAlu> + [(OpNode immZExt4:$b)], IIC_FSLp> { bits<4> fslno; @@ -92,7 +92,7 @@ class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : class FSLPutTD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> : MBlazeInst<op, FCR, (outs), (ins GPR:$b), !strconcat(instr_asm, " $b"), - [(OpNode GPR:$b)], IIAlu> + [(OpNode GPR:$b)], IIC_FSLp> { bits<5> rb; diff --git a/lib/Target/MBlaze/MBlazeInstrFormats.td b/lib/Target/MBlaze/MBlazeInstrFormats.td index d62574d..54f605f 100644 --- a/lib/Target/MBlaze/MBlazeInstrFormats.td +++ b/lib/Target/MBlaze/MBlazeInstrFormats.td @@ -81,7 +81,7 @@ class MBlazeInst<bits<6> op, Format form, dag outs, dag ins, string asmstr, // Pseudo instruction class //===----------------------------------------------------------------------===// class MBlazePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>: - MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIPseudo>; + MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIC_Pseudo>; //===----------------------------------------------------------------------===// // Type A instruction class in MBlaze : <|opcode|rd|ra|rb|flags|> diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp index b353dcd..794ebed 100644 --- 
a/lib/Target/MBlaze/MBlazeInstrInfo.cpp +++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp @@ -17,6 +17,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ScoreboardHazardRecognizer.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "MBlazeGenInstrInfo.inc" diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h index b7300c1..b717da8 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.h +++ b/lib/Target/MBlaze/MBlazeInstrInfo.h @@ -261,7 +261,6 @@ public: virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; - virtual void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td index 7b8f70a..896e8ea 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.td +++ b/lib/Target/MBlaze/MBlazeInstrInfo.td @@ -47,22 +47,22 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MBCallSeqEnd, //===----------------------------------------------------------------------===// // MBlaze Instruction Predicate Definitions. //===----------------------------------------------------------------------===// -def HasPipe3 : Predicate<"Subtarget.hasPipe3()">; +// def HasPipe3 : Predicate<"Subtarget.hasPipe3()">; def HasBarrel : Predicate<"Subtarget.hasBarrel()">; -def NoBarrel : Predicate<"!Subtarget.hasBarrel()">; +// def NoBarrel : Predicate<"!Subtarget.hasBarrel()">; def HasDiv : Predicate<"Subtarget.hasDiv()">; def HasMul : Predicate<"Subtarget.hasMul()">; -def HasFSL : Predicate<"Subtarget.hasFSL()">; -def HasEFSL : Predicate<"Subtarget.hasEFSL()">; -def HasMSRSet : Predicate<"Subtarget.hasMSRSet()">; -def HasException : Predicate<"Subtarget.hasException()">; +// def HasFSL : Predicate<"Subtarget.hasFSL()">; +// def HasEFSL : Predicate<"Subtarget.hasEFSL()">; +// def HasMSRSet : Predicate<"Subtarget.hasMSRSet()">; +// def HasException : Predicate<"Subtarget.hasException()">; def HasPatCmp : Predicate<"Subtarget.hasPatCmp()">; def HasFPU : Predicate<"Subtarget.hasFPU()">; -def HasESR : Predicate<"Subtarget.hasESR()">; -def HasPVR : Predicate<"Subtarget.hasPVR()">; +// def HasESR : Predicate<"Subtarget.hasESR()">; +// def HasPVR : Predicate<"Subtarget.hasPVR()">; def HasMul64 : Predicate<"Subtarget.hasMul64()">; def HasSqrt : Predicate<"Subtarget.hasSqrt()">; -def HasMMU : Predicate<"Subtarget.hasMMU()">; +// def HasMMU : Predicate<"Subtarget.hasMMU()">; //===----------------------------------------------------------------------===// // MBlaze Operand, Complex Patterns and Transformations Definitions. 
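The ScoreboardHazardRecognizer.h include added to MBlazeInstrInfo.cpp above is the hook through which the itineraries defined later in this patch reach the scheduler. A minimal sketch of the usual wiring in backends of this vintage follows; the helper name is hypothetical and not code from this patch, and the ScoreboardHazardRecognizer constructor is the era's generic API:

  // Hypothetical helper, not patch code: builds a scoreboard-based hazard
  // recognizer from a subtarget's itinerary tables. The scoreboard replays
  // each instruction's InstrStage list (IF, ID, EX, ...) and delays issue
  // whenever a functional unit would be double-booked.
  #include "llvm/CodeGen/ScheduleDAG.h"
  #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
  #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
  #include "llvm/Target/TargetInstrItineraries.h"

  static llvm::ScheduleHazardRecognizer *
  createMBlazeHazardRecognizer(const llvm::InstrItineraryData *Itins,
                               const llvm::ScheduleDAG *DAG) {
    // An empty itinerary (the generic "mblaze" CPU) makes the scoreboard a
    // no-op, which is why scheduling is keyed off the selected CPU below.
    return new llvm::ScoreboardHazardRecognizer(Itins, DAG);
  }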
@@ -170,18 +170,18 @@ class ArithI<bits<6> op, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type> : TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIAlu>; + [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_ALU>; class ArithI32<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; class ShiftI<bits<6> op, bits<2> flags, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type> : SHT<op, flags, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIAlu>; + [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_SHT>; class ArithR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode, InstrItinClass itin> : @@ -193,7 +193,7 @@ class ArithRI<bits<6> op, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type> : TBR<op, (outs GPR:$dst), (ins Od:$b, GPR:$c), !strconcat(instr_asm, " $dst, $c, $b"), - [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIAlu>; + [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIC_ALU>; class ArithN<bits<6> op, bits<11> flags, string instr_asm, InstrItinClass itin> : @@ -204,7 +204,7 @@ class ArithN<bits<6> op, bits<11> flags, string instr_asm, class ArithNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; class ArithRN<bits<6> op, bits<11> flags, string instr_asm, InstrItinClass itin> : @@ -215,7 +215,7 @@ class ArithRN<bits<6> op, bits<11> flags, string instr_asm, class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : TBR<op, (outs GPR:$dst), (ins Od:$c, GPR:$b), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; //===----------------------------------------------------------------------===// // Misc Arithmetic Instructions @@ -224,23 +224,23 @@ class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> : class Logic<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode> : TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIAlu>; + [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIC_ALU>; class LogicI<bits<6> op, string instr_asm, SDNode OpNode> : TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c), !strconcat(instr_asm, " $dst, $b, $c"), [(set GPR:$dst, (OpNode GPR:$b, immZExt16:$c))], - IIAlu>; + IIC_ALU>; class LogicI32<bits<6> op, string instr_asm> : TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; class PatCmp<bits<6> op, bits<11> flags, string instr_asm> : TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c), !strconcat(instr_asm, " $dst, $b, $c"), - [], IIAlu>; + [], IIC_ALU>; //===----------------------------------------------------------------------===// // Memory Access Instructions @@ -248,22 +248,22 @@ class PatCmp<bits<6> op, bits<11> flags, string instr_asm> : class LoadM<bits<6> op, bits<11> flags, string instr_asm> : TA<op, flags, (outs GPR:$dst), (ins memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [], IILoad>; + [], IIC_MEMl>; class LoadMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs GPR:$dst), (ins memri:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(set (i32 GPR:$dst), (OpNode 
iaddr:$addr))], IILoad>; + [(set (i32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>; class StoreM<bits<6> op, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$dst, memrr:$addr), !strconcat(instr_asm, " $dst, $addr"), - [], IIStore>; + [], IIC_MEMs>; class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> : TB<op, (outs), (ins GPR:$dst, memri:$addr), !strconcat(instr_asm, " $dst, $addr"), - [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIStore>; + [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIC_MEMs>; //===----------------------------------------------------------------------===// // Branch Instructions @@ -271,7 +271,7 @@ class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> : class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$target), !strconcat(instr_asm, " $target"), - [], IIBranch> { + [], IIC_BR> { let rd = 0x0; let ra = br; let Form = FCCR; @@ -280,7 +280,7 @@ class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : class BranchI<bits<6> op, bits<5> br, string instr_asm> : TB<op, (outs), (ins brtarget:$target), !strconcat(instr_asm, " $target"), - [], IIBranch> { + [], IIC_BR> { let rd = 0; let ra = br; let Form = FCCI; @@ -292,7 +292,7 @@ class BranchI<bits<6> op, bits<5> br, string instr_asm> : class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$link, GPR:$target, variable_ops), !strconcat(instr_asm, " $link, $target"), - [], IIBranch> { + [], IIC_BRl> { let ra = br; let Form = FRCR; } @@ -300,7 +300,7 @@ class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : class BranchLI<bits<6> op, bits<5> br, string instr_asm> : TB<op, (outs), (ins GPR:$link, calltarget:$target, variable_ops), !strconcat(instr_asm, " $link, $target"), - [], IIBranch> { + [], IIC_BRl> { let ra = br; let Form = FRCI; } @@ -312,7 +312,7 @@ class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : TA<op, flags, (outs), (ins GPR:$a, GPR:$b), !strconcat(instr_asm, " $a, $b"), - [], IIBranch> { + [], IIC_BRc> { let rd = br; let Form = FCRR; } @@ -320,7 +320,7 @@ class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> : class BranchCI<bits<6> op, bits<5> br, string instr_asm> : TB<op, (outs), (ins GPR:$a, brtarget:$offset), !strconcat(instr_asm, " $a, $offset"), - [], IIBranch> { + [], IIC_BRc> { let rd = br; let Form = FCRI; } @@ -330,71 +330,74 @@ class BranchCI<bits<6> op, bits<5> br, string instr_asm> : //===----------------------------------------------------------------------===// let isCommutable = 1, isAsCheapAsAMove = 1 in { - def ADDK : Arith<0x04, 0x000, "addk ", add, IIAlu>; + def ADDK : Arith<0x04, 0x000, "addk ", add, IIC_ALU>; def AND : Logic<0x21, 0x000, "and ", and>; def OR : Logic<0x20, 0x000, "or ", or>; def XOR : Logic<0x22, 0x000, "xor ", xor>; - def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">; - def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">; - def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">; + + let Predicates=[HasPatCmp] in { + def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">; + def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">; + def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">; + } let Defs = [CARRY] in { - def ADD : Arith<0x00, 0x000, "add ", addc, IIAlu>; + def ADD : Arith<0x00, 0x000, "add ", addc, IIC_ALU>; let Uses = [CARRY] in { - def ADDC : Arith<0x02, 0x000, "addc ", adde, IIAlu>; + def ADDC : Arith<0x02, 0x000, "addc ", adde, IIC_ALU>; } } let Uses = [CARRY] in { - def ADDKC : ArithN<0x06, 0x000, "addkc ", IIAlu>; + def 
ADDKC : ArithN<0x06, 0x000, "addkc ", IIC_ALU>; } } let isAsCheapAsAMove = 1 in { - def ANDN : ArithN<0x23, 0x000, "andn ", IIAlu>; - def CMP : ArithN<0x05, 0x001, "cmp ", IIAlu>; - def CMPU : ArithN<0x05, 0x003, "cmpu ", IIAlu>; - def RSUBK : ArithR<0x05, 0x000, "rsubk ", sub, IIAlu>; + def ANDN : ArithN<0x23, 0x000, "andn ", IIC_ALU>; + def CMP : ArithN<0x05, 0x001, "cmp ", IIC_ALU>; + def CMPU : ArithN<0x05, 0x003, "cmpu ", IIC_ALU>; + def RSUBK : ArithR<0x05, 0x000, "rsubk ", sub, IIC_ALU>; let Defs = [CARRY] in { - def RSUB : ArithR<0x01, 0x000, "rsub ", subc, IIAlu>; + def RSUB : ArithR<0x01, 0x000, "rsub ", subc, IIC_ALU>; let Uses = [CARRY] in { - def RSUBC : ArithR<0x03, 0x000, "rsubc ", sube, IIAlu>; + def RSUBC : ArithR<0x03, 0x000, "rsubc ", sube, IIC_ALU>; } } let Uses = [CARRY] in { - def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIAlu>; + def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIC_ALU>; } } let isCommutable = 1, Predicates=[HasMul] in { - def MUL : Arith<0x10, 0x000, "mul ", mul, IIAlu>; + def MUL : Arith<0x10, 0x000, "mul ", mul, IIC_ALUm>; } let isCommutable = 1, Predicates=[HasMul,HasMul64] in { - def MULH : Arith<0x10, 0x001, "mulh ", mulhs, IIAlu>; - def MULHU : Arith<0x10, 0x003, "mulhu ", mulhu, IIAlu>; + def MULH : Arith<0x10, 0x001, "mulh ", mulhs, IIC_ALUm>; + def MULHU : Arith<0x10, 0x003, "mulhu ", mulhu, IIC_ALUm>; } let Predicates=[HasMul,HasMul64] in { - def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIAlu>; + def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIC_ALUm>; } let Predicates=[HasBarrel] in { - def BSRL : Arith<0x11, 0x000, "bsrl ", srl, IIAlu>; - def BSRA : Arith<0x11, 0x200, "bsra ", sra, IIAlu>; - def BSLL : Arith<0x11, 0x400, "bsll ", shl, IIAlu>; + def BSRL : Arith<0x11, 0x000, "bsrl ", srl, IIC_SHT>; + def BSRA : Arith<0x11, 0x200, "bsra ", sra, IIC_SHT>; + def BSLL : Arith<0x11, 0x400, "bsll ", shl, IIC_SHT>; def BSRLI : ShiftI<0x19, 0x0, "bsrli ", srl, uimm5, immZExt5>; def BSRAI : ShiftI<0x19, 0x1, "bsrai ", sra, uimm5, immZExt5>; def BSLLI : ShiftI<0x19, 0x2, "bslli ", shl, uimm5, immZExt5>; } let Predicates=[HasDiv] in { - def IDIV : ArithR<0x12, 0x000, "idiv ", sdiv, IIAlu>; - def IDIVU : ArithR<0x12, 0x002, "idivu ", udiv, IIAlu>; + def IDIV : ArithR<0x12, 0x000, "idiv ", sdiv, IIC_ALUd>; + def IDIVU : ArithR<0x12, 0x002, "idivu ", udiv, IIC_ALUd>; } //===----------------------------------------------------------------------===// @@ -552,7 +555,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTSD : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rtsd $target, $imm", [], - IIBranch>; + IIC_BR>; } let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, @@ -560,7 +563,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTID : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rtid $target, $imm", [], - IIBranch>; + IIC_BR>; } let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, @@ -568,7 +571,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTBD : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rtbd $target, $imm", [], - IIBranch>; + IIC_BR>; } let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, @@ -576,7 +579,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, def RTED : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm), "rted $target, $imm", [], - IIBranch>; + IIC_BR>; } //===----------------------------------------------------------------------===// @@ -584,7 +587,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, 
//===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { - def NOP : MBlazeInst< 0x20, FC, (outs), (ins), "nop ", [], IIAlu>; + def NOP : MBlazeInst< 0x20, FC, (outs), (ins), "nop ", [], IIC_ALU>; } let usesCustomInserter = 1 in { @@ -611,17 +614,17 @@ let usesCustomInserter = 1 in { let rb = 0 in { def SEXT16 : TA<0x24, 0x061, (outs GPR:$dst), (ins GPR:$src), - "sext16 $dst, $src", [], IIAlu>; + "sext16 $dst, $src", [], IIC_ALU>; def SEXT8 : TA<0x24, 0x060, (outs GPR:$dst), (ins GPR:$src), - "sext8 $dst, $src", [], IIAlu>; + "sext8 $dst, $src", [], IIC_ALU>; let Defs = [CARRY] in { def SRL : TA<0x24, 0x041, (outs GPR:$dst), (ins GPR:$src), - "srl $dst, $src", [], IIAlu>; + "srl $dst, $src", [], IIC_ALU>; def SRA : TA<0x24, 0x001, (outs GPR:$dst), (ins GPR:$src), - "sra $dst, $src", [], IIAlu>; + "sra $dst, $src", [], IIC_ALU>; let Uses = [CARRY] in { def SRC : TA<0x24, 0x021, (outs GPR:$dst), (ins GPR:$src), - "src $dst, $src", [], IIAlu>; + "src $dst, $src", [], IIC_ALU>; } } } @@ -637,36 +640,36 @@ let isCodeGenOnly=1 in { //===----------------------------------------------------------------------===// let Form=FRCS in { def MFS : SPC<0x25, 0x2, (outs GPR:$dst), (ins SPR:$src), - "mfs $dst, $src", [], IIAlu>; + "mfs $dst, $src", [], IIC_ALU>; } let Form=FCRCS in { def MTS : SPC<0x25, 0x3, (outs SPR:$dst), (ins GPR:$src), - "mts $dst, $src", [], IIAlu>; + "mts $dst, $src", [], IIC_ALU>; } def MSRSET : MSR<0x25, 0x20, (outs GPR:$dst), (ins uimm15:$set), - "msrset $dst, $set", [], IIAlu>; + "msrset $dst, $set", [], IIC_ALU>; def MSRCLR : MSR<0x25, 0x22, (outs GPR:$dst), (ins uimm15:$clr), - "msrclr $dst, $clr", [], IIAlu>; + "msrclr $dst, $clr", [], IIC_ALU>; let rd=0x0, Form=FCRR in { def WDC : TA<0x24, 0x64, (outs), (ins GPR:$a, GPR:$b), - "wdc $a, $b", [], IIAlu>; + "wdc $a, $b", [], IIC_WDC>; def WDCF : TA<0x24, 0x74, (outs), (ins GPR:$a, GPR:$b), - "wdc.flush $a, $b", [], IIAlu>; + "wdc.flush $a, $b", [], IIC_WDC>; def WDCC : TA<0x24, 0x66, (outs), (ins GPR:$a, GPR:$b), - "wdc.clear $a, $b", [], IIAlu>; + "wdc.clear $a, $b", [], IIC_WDC>; def WIC : TA<0x24, 0x68, (outs), (ins GPR:$a, GPR:$b), - "wic $a, $b", [], IIAlu>; + "wic $a, $b", [], IIC_WDC>; } def BRK : BranchL<0x26, 0x0C, 0x000, "brk ">; def BRKI : BranchLI<0x2E, 0x0C, "brki ">; def IMM : MBlazeInst<0x2C, FCCI, (outs), (ins simm16:$imm), - "imm $imm", [], IIAlu>; + "imm $imm", [], IIC_ALU>; //===----------------------------------------------------------------------===// // Pseudo instructions for atomic operations @@ -848,11 +851,6 @@ def : Pat<(MBWrapper tconstpool:$in), (ORI (i32 R0), tconstpool:$in)>; // Misc instructions def : Pat<(and (i32 GPR:$lh), (not (i32 GPR:$rh))),(ANDN GPR:$lh, GPR:$rh)>; -// Arithmetic with immediates -def : Pat<(add (i32 GPR:$in), imm:$imm),(ADDIK GPR:$in, imm:$imm)>; -def : Pat<(or (i32 GPR:$in), imm:$imm),(ORI GPR:$in, imm:$imm)>; -def : Pat<(xor (i32 GPR:$in), imm:$imm),(XORI GPR:$in, imm:$imm)>; - // Convert any extend loads into zero extend loads def : Pat<(extloadi8 iaddr:$src), (i32 (LBUI iaddr:$src))>; def : Pat<(extloadi16 iaddr:$src), (i32 (LHUI iaddr:$src))>; diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp index fa9140d..ed8511d 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp @@ -181,6 +181,26 @@ unsigned MBlazeRegisterInfo::getSpecialRegisterFromNumbering(unsigned Reg) { return 0; // Not reached } +bool 
MBlazeRegisterInfo::isRegister(unsigned Reg) { + return Reg <= 31; +} + +bool MBlazeRegisterInfo::isSpecialRegister(unsigned Reg) { + switch (Reg) { + case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 : + case 0x0007 : case 0x000B : case 0x000D : case 0x1000 : + case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 : + case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 : + case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 : + case 0x2008 : case 0x2009 : case 0x200A : case 0x200B : + return true; + + default: + return false; + } + return false; // Not reached +} + unsigned MBlazeRegisterInfo::getPICCallReg() { return MBlaze::R20; } diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h index 839536d..69ec5aa 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.h +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h @@ -45,6 +45,8 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo { static unsigned getRegisterNumbering(unsigned RegEnum); static unsigned getRegisterFromNumbering(unsigned RegEnum); static unsigned getSpecialRegisterFromNumbering(unsigned RegEnum); + static bool isRegister(unsigned RegEnum); + static bool isSpecialRegister(unsigned RegEnum); /// Get PIC indirect call register static unsigned getPICCallReg(); diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.td b/lib/Target/MBlaze/MBlazeRegisterInfo.td index fbefb22..1a695a7 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.td +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.td @@ -85,18 +85,19 @@ let Namespace = "MBlaze" in { def RTLBX : MBlazeSPRReg<0x1002, "rtlbx">, DwarfRegNum<[41]>; def RTLBLO : MBlazeSPRReg<0x1003, "rtlblo">, DwarfRegNum<[42]>; def RTLBHI : MBlazeSPRReg<0x1004, "rtlbhi">, DwarfRegNum<[43]>; - def RPVR0 : MBlazeSPRReg<0x2000, "rpvr0">, DwarfRegNum<[44]>; - def RPVR1 : MBlazeSPRReg<0x2001, "rpvr1">, DwarfRegNum<[45]>; - def RPVR2 : MBlazeSPRReg<0x2002, "rpvr2">, DwarfRegNum<[46]>; - def RPVR3 : MBlazeSPRReg<0x2003, "rpvr3">, DwarfRegNum<[47]>; - def RPVR4 : MBlazeSPRReg<0x2004, "rpvr4">, DwarfRegNum<[48]>; - def RPVR5 : MBlazeSPRReg<0x2005, "rpvr5">, DwarfRegNum<[49]>; - def RPVR6 : MBlazeSPRReg<0x2006, "rpvr6">, DwarfRegNum<[50]>; - def RPVR7 : MBlazeSPRReg<0x2007, "rpvr7">, DwarfRegNum<[51]>; - def RPVR8 : MBlazeSPRReg<0x2008, "rpvr8">, DwarfRegNum<[52]>; - def RPVR9 : MBlazeSPRReg<0x2009, "rpvr9">, DwarfRegNum<[53]>; - def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[54]>; - def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, DwarfRegNum<[55]>; + def RTLBSX : MBlazeSPRReg<0x1004, "rtlbsx">, DwarfRegNum<[44]>; + def RPVR0 : MBlazeSPRReg<0x2000, "rpvr0">, DwarfRegNum<[45]>; + def RPVR1 : MBlazeSPRReg<0x2001, "rpvr1">, DwarfRegNum<[46]>; + def RPVR2 : MBlazeSPRReg<0x2002, "rpvr2">, DwarfRegNum<[47]>; + def RPVR3 : MBlazeSPRReg<0x2003, "rpvr3">, DwarfRegNum<[48]>; + def RPVR4 : MBlazeSPRReg<0x2004, "rpvr4">, DwarfRegNum<[49]>; + def RPVR5 : MBlazeSPRReg<0x2005, "rpvr5">, DwarfRegNum<[50]>; + def RPVR6 : MBlazeSPRReg<0x2006, "rpvr6">, DwarfRegNum<[51]>; + def RPVR7 : MBlazeSPRReg<0x2007, "rpvr7">, DwarfRegNum<[52]>; + def RPVR8 : MBlazeSPRReg<0x2008, "rpvr8">, DwarfRegNum<[53]>; + def RPVR9 : MBlazeSPRReg<0x2009, "rpvr9">, DwarfRegNum<[54]>; + def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[55]>; + def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, DwarfRegNum<[56]>; // The carry bit. 
In the Microblaze this is really bit 29 of the // MSR register but this is the only bit of that register that we diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td index ac4d98c..4662f25 100644 --- a/lib/Target/MBlaze/MBlazeSchedule.td +++ b/lib/Target/MBlaze/MBlazeSchedule.td @@ -8,57 +8,48 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Functional units across MBlaze chips sets. Based on GCC/MBlaze backend files. +// MBlaze functional units. //===----------------------------------------------------------------------===// -def ALU : FuncUnit; -def IMULDIV : FuncUnit; +def IF : FuncUnit; +def ID : FuncUnit; +def EX : FuncUnit; +def MA : FuncUnit; +def WB : FuncUnit; //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for MBlaze //===----------------------------------------------------------------------===// -def IIAlu : InstrItinClass; -def IILoad : InstrItinClass; -def IIStore : InstrItinClass; -def IIXfer : InstrItinClass; -def IIBranch : InstrItinClass; -def IIHiLo : InstrItinClass; -def IIImul : InstrItinClass; -def IIIdiv : InstrItinClass; -def IIFcvt : InstrItinClass; -def IIFmove : InstrItinClass; -def IIFcmp : InstrItinClass; -def IIFadd : InstrItinClass; -def IIFmulSingle : InstrItinClass; -def IIFmulDouble : InstrItinClass; -def IIFdivSingle : InstrItinClass; -def IIFdivDouble : InstrItinClass; -def IIFsqrtSingle : InstrItinClass; -def IIFsqrtDouble : InstrItinClass; -def IIFrecipFsqrtStep : InstrItinClass; -def IIPseudo : InstrItinClass; +def IIC_ALU : InstrItinClass; +def IIC_ALUm : InstrItinClass; +def IIC_ALUd : InstrItinClass; +def IIC_SHT : InstrItinClass; +def IIC_FSLg : InstrItinClass; +def IIC_FSLp : InstrItinClass; +def IIC_MEMs : InstrItinClass; +def IIC_MEMl : InstrItinClass; +def IIC_FPU : InstrItinClass; +def IIC_FPUd : InstrItinClass; +def IIC_FPUf : InstrItinClass; +def IIC_FPUi : InstrItinClass; +def IIC_FPUs : InstrItinClass; +def IIC_FPUc : InstrItinClass; +def IIC_BR : InstrItinClass; +def IIC_BRc : InstrItinClass; +def IIC_BRl : InstrItinClass; +def IIC_WDC : InstrItinClass; +def IIC_Pseudo : InstrItinClass; //===----------------------------------------------------------------------===// -// MBlaze Generic instruction itineraries. +// MBlaze generic instruction itineraries. 
//===----------------------------------------------------------------------===// -def MBlazeGenericItineraries : ProcessorItineraries< - [ALU, IMULDIV], [], [ - InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>, - InstrItinData<IILoad , [InstrStage<3, [ALU]>]>, - InstrItinData<IIStore , [InstrStage<1, [ALU]>]>, - InstrItinData<IIXfer , [InstrStage<2, [ALU]>]>, - InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>, - InstrItinData<IIHiLo , [InstrStage<1, [IMULDIV]>]>, - InstrItinData<IIImul , [InstrStage<17, [IMULDIV]>]>, - InstrItinData<IIIdiv , [InstrStage<38, [IMULDIV]>]>, - InstrItinData<IIFcvt , [InstrStage<1, [ALU]>]>, - InstrItinData<IIFmove , [InstrStage<2, [ALU]>]>, - InstrItinData<IIFcmp , [InstrStage<3, [ALU]>]>, - InstrItinData<IIFadd , [InstrStage<4, [ALU]>]>, - InstrItinData<IIFmulSingle , [InstrStage<7, [ALU]>]>, - InstrItinData<IIFmulDouble , [InstrStage<8, [ALU]>]>, - InstrItinData<IIFdivSingle , [InstrStage<23, [ALU]>]>, - InstrItinData<IIFdivDouble , [InstrStage<36, [ALU]>]>, - InstrItinData<IIFsqrtSingle , [InstrStage<54, [ALU]>]>, - InstrItinData<IIFsqrtDouble , [InstrStage<12, [ALU]>]>, - InstrItinData<IIFrecipFsqrtStep , [InstrStage<5, [ALU]>]> -]>; +def MBlazeGenericItineraries : ProcessorItineraries<[], [], []>; + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for three stage pipeline. +//===----------------------------------------------------------------------===// +include "MBlazeSchedule3.td" + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for five stage pipeline. +//===----------------------------------------------------------------------===// +include "MBlazeSchedule5.td" diff --git a/lib/Target/MBlaze/MBlazeSchedule3.td b/lib/Target/MBlaze/MBlazeSchedule3.td new file mode 100644 index 0000000..ccbf99d --- /dev/null +++ b/lib/Target/MBlaze/MBlazeSchedule3.td @@ -0,0 +1,236 @@ +//===- MBlazeSchedule3.td - MBlaze Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for the three stage pipeline. +//===----------------------------------------------------------------------===// +def MBlazePipe3Itineraries : ProcessorItineraries< + [IF,ID,EX], [], [ + + // ALU instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the stages. The + // two source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_ALU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU multiply instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. 
The instruction takes one cycle to execute in each of the + // pipeline stages except the execute stage, which takes three cycles. The + // two source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_ALUm, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<3,[EX]>], // three cycles in execute stage + [ 4 // result ready after four cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU divide instruction with one destination register and two register source + // operands. The instruction takes one cycle to execute in each of the pipeline + // stages except the execute stage, which takes 34 cycles. The two + // source operands are read during the decode stage and the result is ready + // after the execute stage. + InstrItinData< IIC_ALUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<34,[EX]>], // 34 cycles in execute stage + [ 35 // result ready after 35 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Shift instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the pipeline stages + // except the execute stage, which takes two cycles. The two source operands + // are read during the decode stage and the result is ready after the execute + // stage. + InstrItinData< IIC_SHT, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 3 // result ready after three cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch instruction with one source operand register. The instruction takes + // one cycle to execute in each of the pipeline stages. The source operand is + // read during the decode stage. + InstrItinData< IIC_BR, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 1 ]>, // first operand read after one cycle + + // Conditional branch instruction with two source operand registers. The + // instruction takes one cycle to execute in each of the pipeline stages. The + // two source operands are read during the decode stage. + InstrItinData< IIC_BRc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch and link instruction with one destination register and one source + // operand register. The instruction takes one cycle to execute in each of + // the pipeline stages. The source operand is read during the decode stage + // and the destination register is ready after the execute stage. + InstrItinData< IIC_BRl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]>], // one cycle in execute stage + [ 2 // result ready after two cycles + , 1 ]>, // first operand read after one cycle + + // Cache control instruction with two source operand registers. 
The + // instruction takes one cycle to execute in each of the pipeline stages + // except the execute stage, which takes two cycles. The source + // operands are read during the decode stage. + InstrItinData< IIC_WDC, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point instruction with one destination register and two source + // operand registers. The instruction takes one cycle to execute in each of + // the pipeline stages except the execute stage, which takes six cycles. The + // source operands are read during the decode stage and the results are ready + // after the execute stage. + InstrItinData< IIC_FPU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<6,[EX]>], // six cycles in execute stage + [ 7 // result ready after seven cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point divide instruction with one destination register and two + // source operand registers. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes 30 + // cycles. The source operands are read during the decode stage and the + // results are ready after the execute stage. + InstrItinData< IIC_FPUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<30,[EX]>], // 30 cycles in execute stage + [ 31 // result ready after 31 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Convert floating point to integer instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the execute stage, + // which takes seven cycles. The source operands are read during the decode + // stage and the results are ready after the execute stage. + InstrItinData< IIC_FPUi, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<7,[EX]>], // seven cycles in execute stage + [ 8 // result ready after eight cycles + , 1 ]>, // first operand read after one cycle + + // Convert integer to floating point instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the execute stage, + // which takes six cycles. The source operands are read during the decode + // stage and the results are ready after the execute stage. + InstrItinData< IIC_FPUf, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<6,[EX]>], // six cycles in execute stage + [ 7 // result ready after seven cycles + , 1 ]>, // first operand read after one cycle + + // Floating point square root instruction with one destination register and + // one source operand register. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes 29 + // cycles. The source operands are read during the decode stage and the + // results are ready after the execute stage. 
+ InstrItinData< IIC_FPUs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<29,[EX]>], // 29 cycles in execute stage + [ 30 // result ready after 30 cycles + , 1 ]>, // first operand read after one cycle + + // Floating point comparison instruction with one destination register and + // two source operand registers. The instruction takes one cycle to execute + // in each of the pipeline stages except the execute stage, which takes three + // cycles. The source operands are read during the decode stage and the + // results are ready after the execute stage. + InstrItinData< IIC_FPUc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<3,[EX]>], // three cycles in execute stage + [ 4 // result ready after four cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // FSL get instruction with one register or immediate source operand and one + // destination register. The instruction takes one cycle to execute in each + // of the pipeline stages except the execute stage, which takes two cycles. + // The one source operand is read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_FSLg, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 3 // result ready after three cycles + , 1 ]>, // first operand read after one cycle + + // FSL put instruction with either two register source operands or one + // register source operand and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes two + // cycles. The two source operands are read during the decode stage. + InstrItinData< IIC_FSLp, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Memory store instruction with either three register source operands or two + // register source operands and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages except the execute stage, which takes two + // cycles. All of the source operands are read during the decode stage. + InstrItinData< IIC_MEMs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 1 // first operand read after one cycle + , 1 // second operand read after one cycle + , 1 ]>, // third operand read after one cycle + + // Memory load instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages except the execute stage, which takes two cycles. All of + // the source operands are read during the decode stage and the result is + // ready after the execute stage. 
+ InstrItinData< IIC_MEMl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<2,[EX]>], // two cycles in execute stage + [ 3 // result ready after three cycles + , 1 // first operand read after one cycle + , 1 ]> // second operand read after one cycle +]>; diff --git a/lib/Target/MBlaze/MBlazeSchedule5.td b/lib/Target/MBlaze/MBlazeSchedule5.td new file mode 100644 index 0000000..fa88766 --- /dev/null +++ b/lib/Target/MBlaze/MBlazeSchedule5.td @@ -0,0 +1,267 @@ +//===- MBlazeSchedule5.td - MBlaze Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MBlaze instruction itineraries for the five stage pipeline. +//===----------------------------------------------------------------------===// +def MBlazePipe5Itineraries : ProcessorItineraries< + [IF,ID,EX,MA,WB], [], [ + + // ALU instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the stages. The + // two source operands are read during the decode stage and the result is + // ready after the execute stage. + InstrItinData< IIC_ALU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU multiply instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages. The two source operands are read during the decode stage + // and the result is ready after the execute stage. + InstrItinData< IIC_ALUm, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // ALU divide instruction with one destination register and two register source + // operands. The instruction takes one cycle to execute in each of the pipeline + // stages except the memory access stage, which takes 31 cycles. The two + // source operands are read during the decode stage and the result is ready + // after the memory access stage. 
+ InstrItinData< IIC_ALUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<31,[MA]> // 31 cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 33 // result ready after 33 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Shift instruction with one destination register and either two register + // source operands or one register source operand and one immediate operand. + // The instruction takes one cycle to execute in each of the pipeline stages. + // The two source operands are read during the decode stage and the result is + // ready after the memory access stage. + InstrItinData< IIC_SHT, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 3 // result ready after three cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch instruction with one source operand register. The instruction takes + // one cycle to execute in each of the pipeline stages. The source operand is + // read during the decode stage. + InstrItinData< IIC_BR, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 ]>, // first operand read after one cycle + + // Conditional branch instruction with two source operand registers. The + // instruction takes one cycle to execute in each of the pipeline stages. The + // two source operands are read during the decode stage. + InstrItinData< IIC_BRc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Branch and link instruction with one destination register and one source + // operand register. The instruction takes one cycle to execute in each of + // the pipeline stages. The source operand is read during the decode stage + // and the destination register is ready after the writeback stage. + InstrItinData< IIC_BRl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 4 // result ready after four cycles + , 1 ]>, // first operand read after one cycle + + // Cache control instruction with two source operand registers. The + // instruction takes one cycle to execute in each of the pipeline stages + // except the memory access stage, which takes two cycles. The source + // operands are read during the decode stage. 
+ InstrItinData< IIC_WDC, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<2,[MA]> // two cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point instruction with one destination register and two source + // operand registers. The instruction takes one cycle to execute in each of + // the pipeline stages except the memory access stage, which takes two + // cycles. The source operands are read during the decode stage and the + // results are ready after the writeback stage. + InstrItinData< IIC_FPU, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<2,[MA]> // two cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 5 // result ready after five cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Floating point divide instruction with one destination register and two + // source operand registers. The instruction takes one cycle to execute in + // each of the pipeline stages except the memory access stage, which takes 26 + // cycles. The source operands are read during the decode stage and the + // results are ready after the writeback stage. + InstrItinData< IIC_FPUd, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<26,[MA]> // 26 cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 29 // result ready after 29 cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Convert floating point to integer instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the memory access stage, + // which takes three cycles. The source operands are read during the decode + // stage and the results are ready after the writeback stage. + InstrItinData< IIC_FPUi, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<3,[MA]> // three cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 6 // result ready after six cycles + , 1 ]>, // first operand read after one cycle + + // Convert integer to floating point instruction with one destination + // register and one source operand register. The instruction takes one cycle + // to execute in each of the pipeline stages except the memory access stage, + // which takes two cycles. The source operands are read during the decode + // stage and the results are ready after the writeback stage. 
+ InstrItinData< IIC_FPUf, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<2,[MA]> // two cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 5 // result ready after five cycles + , 1 ]>, // first operand read after one cycle + + // Floating point square root instruction with one destination register and + // one source operand register. The instruction takes one cycle to execute in + // each of the pipeline stages except the memory access stage, which takes 25 + // cycles. The source operands are read during the decode stage and the + // results are ready after the writeback stage. + InstrItinData< IIC_FPUs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<25,[MA]> // 25 cycles in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 28 // result ready after 28 cycles + , 1 ]>, // first operand read after one cycle + + // Floating point comparison instruction with one destination register and + // two source operand registers. The instruction takes one cycle to execute + // in each of the pipeline stages. The source operands are read during the + // decode stage and the results are ready after the execute stage. + InstrItinData< IIC_FPUc, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // FSL get instruction with one register or immediate source operand and one + // destination register. The instruction takes one cycle to execute in each + // of the pipeline stages. The one source operand is read during the decode + // stage and the result is ready after the execute stage. + InstrItinData< IIC_FSLg, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 2 // result ready after two cycles + , 1 ]>, // first operand read after one cycle + + // FSL put instruction with either two register source operands or one + // register source operand and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages. The two source operands are read during the + // decode stage. + InstrItinData< IIC_FSLp, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 ]>, // second operand read after one cycle + + // Memory store instruction with either three register source operands or two + // register source operands and one immediate operand. There is no result + // produced by the instruction. The instruction takes one cycle to execute in + // each of the pipeline stages. 
All of the source operands are read during + // the decode stage. + InstrItinData< IIC_MEMs, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 1 // first operand read after one cycle + , 1 // second operand read after one cycle + , 1 ]>, // third operand read after one cycle + + // Memory load instruction with one destination register and either two + // register source operands or one register source operand and one immediate + // operand. The instruction takes one cycle to execute in each of the + // pipeline stages. All of the source operands are read during the decode + // stage and the result is ready after the writeback stage. + InstrItinData< IIC_MEMl, + [ InstrStage<1,[IF]> // one cycle in fetch stage + , InstrStage<1,[ID]> // one cycle in decode stage + , InstrStage<1,[EX]> // one cycle in execute stage + , InstrStage<1,[MA]> // one cycle in memory access stage + , InstrStage<1,[WB]>], // one cycle in write back stage + [ 4 // result ready after four cycles + , 1 // second operand read after one cycle + , 1 ]> // third operand read after one cycle +]>; diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp index 3440521..a80744a 100644 --- a/lib/Target/MBlaze/MBlazeSubtarget.cpp +++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp @@ -13,19 +13,39 @@ #include "MBlazeSubtarget.h" #include "MBlaze.h" +#include "MBlazeRegisterInfo.h" #include "MBlazeGenSubtarget.inc" #include "llvm/Support/CommandLine.h" using namespace llvm; MBlazeSubtarget::MBlazeSubtarget(const std::string &TT, const std::string &FS): - HasPipe3(false), HasBarrel(false), HasDiv(false), HasMul(false), - HasFSL(false), HasEFSL(false), HasMSRSet(false), HasException(false), - HasPatCmp(false), HasFPU(false), HasESR(false), HasPVR(false), - HasMul64(false), HasSqrt(false), HasMMU(false) + HasBarrel(false), HasDiv(false), HasMul(false), HasPatCmp(false), + HasFPU(false), HasMul64(false), HasSqrt(false) { - std::string CPU = "v400"; - MBlazeArchVersion = V400; - // Parse features string. - ParseSubtargetFeatures(FS, CPU); + std::string CPU = "mblaze"; + CPU = ParseSubtargetFeatures(FS, CPU); + + // Only use instruction scheduling if the selected CPU has an instruction + // itinerary (the default CPU is the only one that doesn't). 
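
(A minimal consumer-side sketch, not part of this patch: how the latency
numbers encoded by the itinerary entries above can be read back through this
era's InstrItineraryData interface. dumpItinClass is a hypothetical helper,
and the exact API may differ between revisions.)

  #include "llvm/Target/TargetInstrItineraries.h"
  #include "llvm/Support/raw_ostream.h"

  // Print the scheduling data for one itinerary class, e.g. IIC_FPU above:
  // stage latency 1+1+1+2+1 = 6, operand 0 (the def) ready after 5 cycles,
  // operands 1 and 2 read after 1 cycle.
  static void dumpItinClass(const llvm::InstrItineraryData &Itins,
                            unsigned ItinClass, unsigned NumOps) {
    if (Itins.isEmpty())
      return;                      // no itinerary for this subtarget
    llvm::errs() << "stage latency: "
                 << Itins.getStageLatency(ItinClass) << "\n";
    for (unsigned i = 0; i != NumOps; ++i)
      llvm::errs() << "  operand " << i << " cycle: "
                   << Itins.getOperandCycle(ItinClass, i) << "\n";
  }
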
+  HasItin = CPU != "mblaze";
+  DEBUG(dbgs() << "CPU " << CPU << "(" << HasItin << ")\n");
+
+  // Compute the issue width of the MBlaze itineraries.
+  computeIssueWidth();
+}
+
+void MBlazeSubtarget::computeIssueWidth() {
+  InstrItins.IssueWidth = 1;
+}
+
+bool MBlazeSubtarget::
+enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                      TargetSubtarget::AntiDepBreakMode& Mode,
+                      RegClassVector& CriticalPathRCs) const {
+  Mode = TargetSubtarget::ANTIDEP_CRITICAL;
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(&MBlaze::GPRRegClass);
+  return HasItin && OptLevel >= CodeGenOpt::Default;
 }
+
diff --git a/lib/Target/MBlaze/MBlazeSubtarget.h b/lib/Target/MBlaze/MBlazeSubtarget.h
index bebb3f7..2255b28 100644
--- a/lib/Target/MBlaze/MBlazeSubtarget.h
+++ b/lib/Target/MBlaze/MBlazeSubtarget.h
@@ -24,29 +24,14 @@ namespace llvm {
 class MBlazeSubtarget : public TargetSubtarget {
 
 protected:
-
-  enum MBlazeArchEnum {
-    V400, V500, V600, V700, V710
-  };
-
-  // MBlaze architecture version
-  MBlazeArchEnum MBlazeArchVersion;
-
-  bool HasPipe3;
   bool HasBarrel;
   bool HasDiv;
   bool HasMul;
-  bool HasFSL;
-  bool HasEFSL;
-  bool HasMSRSet;
-  bool HasException;
   bool HasPatCmp;
   bool HasFPU;
-  bool HasESR;
-  bool HasPVR;
   bool HasMul64;
   bool HasSqrt;
-  bool HasMMU;
+  bool HasItin;
 
   InstrItineraryData InstrItins;
 
@@ -61,18 +46,26 @@ public:
   std::string ParseSubtargetFeatures(const std::string &FS,
                                      const std::string &CPU);
 
+  /// Compute the maximum number of issues per cycle for the
+  /// MBlaze scheduling itineraries.
+  void computeIssueWidth();
+
+  /// enablePostRAScheduler - True at 'More' optimization.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtarget::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+
+  /// getInstrItineraryData - Return the instruction itineraries based on
+  /// the subtarget.
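
(A caller-side sketch, not part of this patch, of how the post-RA scheduler
of this era consults the hook declared above; simplified from
llvm/lib/CodeGen/PostRASchedulerList.cpp, so the surrounding plumbing is
approximate.)

  llvm::TargetSubtarget::AntiDepBreakMode Mode;
  llvm::SmallVector<llvm::TargetRegisterClass*, 2> CriticalPathRCs;
  const MBlazeSubtarget &ST = TM.getSubtarget<MBlazeSubtarget>();
  if (ST.enablePostRAScheduler(OptLevel, Mode, CriticalPathRCs)) {
    // Post-RA scheduling runs only when a specific -mcpu with an itinerary
    // was selected (HasItin) and optimization is at CodeGenOpt::Default or
    // higher; anti-dependence breaking is restricted to the GPR critical
    // path.
  }
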
+ const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + bool hasItin() const { return HasItin; } + bool hasPCMP() const { return HasPatCmp; } bool hasFPU() const { return HasFPU; } bool hasSqrt() const { return HasSqrt; } bool hasMul() const { return HasMul; } bool hasMul64() const { return HasMul64; } bool hasDiv() const { return HasDiv; } bool hasBarrel() const { return HasBarrel; } - - bool isV400() const { return MBlazeArchVersion == V400; } - bool isV500() const { return MBlazeArchVersion == V500; } - bool isV600() const { return MBlazeArchVersion == V600; } - bool isV700() const { return MBlazeArchVersion == V700; } - bool isV710() const { return MBlazeArchVersion == V710; } }; } // End llvm namespace diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp index cd949e1..df34a83 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -36,19 +36,18 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, bool RelaxAll, bool NoExecStack) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin()) { llvm_unreachable("MBlaze does not support Darwin MACH-O format"); return NULL; - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: + } + + if (TheTriple.isOSWindows()) { llvm_unreachable("MBlaze does not support Windows COFF format"); return NULL; - default: - return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, - NoExecStack); } + + return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); } @@ -87,7 +86,8 @@ MBlazeTargetMachine(const Target &T, const std::string &TT, DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"), InstrInfo(*this), FrameLowering(Subtarget), - TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this) { + TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this), + InstrItins(Subtarget.getInstrItineraryData()) { if (getRelocationModel() == Reloc::Default) { setRelocationModel(Reloc::Static); } diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h index 45ad078..48ce37a 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.h +++ b/lib/Target/MBlaze/MBlazeTargetMachine.h @@ -38,13 +38,18 @@ namespace llvm { MBlazeSelectionDAGInfo TSInfo; MBlazeIntrinsicInfo IntrinsicInfo; MBlazeELFWriterInfo ELFWriterInfo; + InstrItineraryData InstrItins; + public: MBlazeTargetMachine(const Target &T, const std::string &TT, - const std::string &FS); + const std::string &FS); virtual const MBlazeInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const InstrItineraryData *getInstrItineraryData() const + { return &InstrItins; } + virtual const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; } diff --git a/lib/Target/MBlaze/TODO b/lib/Target/MBlaze/TODO index 2e613eb..317d7c0 100644 --- a/lib/Target/MBlaze/TODO +++ b/lib/Target/MBlaze/TODO @@ -9,8 +9,6 @@ needs to be examined more closely: - The stack layout needs to be examined to make sure it meets the standard, especially in regards to var arg functions. - - The processor itineraries are copied from a different backend - and need to be updated to model the MicroBlaze correctly. - Look at the MBlazeGenFastISel.inc stuff and make use of it if appropriate. @@ -18,9 +16,6 @@ There are a few things that need to be looked at: - There are some instructions that are not generated by the backend and have not been tested as far as the parser is concerned. 
- - The assembly parser does not use any MicroBlaze specific directives. + - The assembly parser does not use many MicroBlaze specific directives. I should investigate if there are MicroBlaze specific directive and, if there are, add them. - - The instruction MFS and MTS use special names for some of the - special registers that can be accessed. These special register - names should be parsed by the assembly parser. diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h index f0e1ce2..63860dc 100644 --- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h +++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h @@ -18,11 +18,12 @@ namespace llvm { class MCOperand; + class TargetMachine; class MSP430InstPrinter : public MCInstPrinter { public: - MSP430InstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) { - } + MSP430InstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) + : MCInstPrinter(MAI) {} virtual void printInst(const MCInst *MI, raw_ostream &O); diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp index a1a7f44..5264d68 100644 --- a/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -164,10 +164,11 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) { } static MCInstPrinter *createMSP430MCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { if (SyntaxVariant == 0) - return new MSP430InstPrinter(MAI); + return new MSP430InstPrinter(TM, MAI); return 0; } diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index a95d59c..006785b 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -515,7 +515,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain and // flag operands which copy the outgoing args into registers. The InFlag in - // necessary since all emited instructions must be stuck together. + // necessary since all emitted instructions must be stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index 26df1a0..8939b0a 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -13,6 +13,7 @@ tablegen(MipsGenSubtarget.inc -gen-subtarget) add_llvm_target(MipsCodeGen MipsAsmPrinter.cpp MipsDelaySlotFiller.cpp + MipsExpandPseudo.cpp MipsInstrInfo.cpp MipsISelDAGToDAG.cpp MipsISelLowering.cpp diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index a9ab050..05b4c5a 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file contains the entry points for global functions defined in +// This file contains the entry points for global functions defined in // the LLVM Mips back-end. 
// //===----------------------------------------------------------------------===// @@ -25,6 +25,7 @@ namespace llvm { FunctionPass *createMipsISelDag(MipsTargetMachine &TM); FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM); + FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM); extern Target TheMipsTarget; extern Target TheMipselTarget; diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 3e6437b..b79016d 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -59,7 +59,7 @@ def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1", def FeatureMips2 : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2", "Mips2 ISA Support">; def FeatureMips32 : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32", - "Mips32 ISA Support", + "Mips32 ISA Support", [FeatureCondMov, FeatureBitCount]>; def FeatureMips32r2 : SubtargetFeature<"mips32r2", "MipsArchVersion", "Mips32r2", "Mips32r2 ISA Support", @@ -81,7 +81,7 @@ def : Proc<"r6000", [FeatureMips2]>; def : Proc<"4ke", [FeatureMips32r2]>; -// Allegrex is a 32bit subset of r4000, both for interger and fp registers, +// Allegrex is a 32bit subset of r4000, both for integer and fp registers, // but much more similar to Mips2 than Mips3. It also contains some of // Mips32/Mips32r2 instructions and a custom vector fpu processor. def : Proc<"allegrex", [FeatureMips2, FeatureSingleFloat, FeatureEABI, diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index bd28a9b..502f744 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -30,7 +30,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegistry.h" @@ -53,14 +53,14 @@ namespace { return "Mips Assembly Printer"; } - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O); void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O); void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O); - void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, + void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = 0); - void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, + void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = 0); void printSavedRegsBitmask(raw_ostream &O); void printHex32(unsigned int Value, raw_ostream &O); @@ -77,7 +77,8 @@ namespace { } virtual void EmitFunctionBodyStart(); virtual void EmitFunctionBodyEnd(); - virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; + virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock* + MBB) const; static const char *getRegisterName(unsigned RegNo); virtual void EmitFunctionEntryLabel(); @@ -94,12 +95,12 @@ namespace { // -- Frame directive "frame Stackpointer, Stacksize, RARegister" // Describe the stack frame. // -// -- Mask directives "(f)mask bitmask, offset" +// -- Mask directives "(f)mask bitmask, offset" // Tells the assembler which registers are saved and where. 
-// bitmask - contain a little endian bitset indicating which registers are -// saved on function prologue (e.g. with a 0x80000000 mask, the +// bitmask - contain a little endian bitset indicating which registers are +// saved on function prologue (e.g. with a 0x80000000 mask, the // assembler knows the register 31 (RA) is saved at prologue. -// offset - the position before stack pointer subtraction indicating where +// offset - the position before stack pointer subtraction indicating where // the first saved register on prologue is located. (e.g. with a // // Consider the following function prologue: @@ -110,9 +111,9 @@ namespace { // sw $ra, 40($sp) // sw $fp, 36($sp) // -// With a 0xc0000000 mask, the assembler knows the register 31 (RA) and -// 30 (FP) are saved at prologue. As the save order on prologue is from -// left to right, RA is saved first. A -8 offset means that after the +// With a 0xc0000000 mask, the assembler knows the register 31 (RA) and +// 30 (FP) are saved at prologue. As the save order on prologue is from +// left to right, RA is saved first. A -8 offset means that after the // stack pointer subtration, the first register in the mask (RA) will be // saved at address 48-8=40. // @@ -122,7 +123,7 @@ namespace { // Mask directives //===----------------------------------------------------------------------===// -// Create a bitmask with all callee saved registers for CPU or Floating Point +// Create a bitmask with all callee saved registers for CPU or Floating Point // registers. For CPU registers consider RA, GP and FP for saving if necessary. void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) { const TargetFrameLowering *TFI = TM.getFrameLowering(); @@ -168,7 +169,7 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) { // Print a 32 bit hex number with all numbers. void MipsAsmPrinter::printHex32(unsigned Value, raw_ostream &O) { O << "0x"; - for (int i = 7; i >= 0; i--) + for (int i = 7; i >= 0; i--) O << utohexstr((Value & (0xF << (i*4))) >> (i*4)); } @@ -191,9 +192,9 @@ void MipsAsmPrinter::emitFrameDirective() { } /// Emit Set directives. -const char *MipsAsmPrinter::getCurrentABIString() const { +const char *MipsAsmPrinter::getCurrentABIString() const { switch (Subtarget->getTargetABI()) { - case MipsSubtarget::O32: return "abi32"; + case MipsSubtarget::O32: return "abi32"; case MipsSubtarget::O64: return "abiO64"; case MipsSubtarget::N32: return "abiN32"; case MipsSubtarget::N64: return "abi64"; @@ -203,7 +204,7 @@ const char *MipsAsmPrinter::getCurrentABIString() const { llvm_unreachable("Unknown Mips ABI"); return NULL; -} +} void MipsAsmPrinter::EmitFunctionEntryLabel() { OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName())); @@ -214,7 +215,7 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() { /// the first basic block in the function. void MipsAsmPrinter::EmitFunctionBodyStart() { emitFrameDirective(); - + SmallString<128> Str; raw_svector_ostream OS(Str); printSavedRegsBitmask(OS); @@ -226,7 +227,7 @@ void MipsAsmPrinter::EmitFunctionBodyStart() { void MipsAsmPrinter::EmitFunctionBodyEnd() { // There are instruction for this macros, but they must // always be at the function end, and we can't emit and - // break with BB logic. + // break with BB logic. 
OutStreamer.EmitRawText(StringRef("\t.set\tmacro")); OutStreamer.EmitRawText(StringRef("\t.set\treorder")); OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName())); @@ -236,8 +237,8 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() { /// isBlockOnlyReachableByFallthough - Return true if the basic block has /// exactly one predecessor and the control transfer mechanism between /// the predecessor and this block is a fall-through. -bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) - const { +bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* + MBB) const { // The predecessor has to be immediately before this block. const MachineBasicBlock *Pred = *MBB->pred_begin(); @@ -246,16 +247,41 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock * if (const BasicBlock *bb = Pred->getBasicBlock()) if (isa<SwitchInst>(bb->getTerminator())) return false; + + // If this is a landing pad, it isn't a fall through. If it has no preds, + // then nothing falls through to it. + if (MBB->isLandingPad() || MBB->pred_empty()) + return false; + + // If there isn't exactly one predecessor, it can't be a fall through. + MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI; + ++PI2; + + if (PI2 != MBB->pred_end()) + return false; + + // The predecessor has to be immediately before this block. + if (!Pred->isLayoutSuccessor(MBB)) + return false; + + // If the block is completely empty, then it definitely does fall through. + if (Pred->empty()) + return true; - return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB); + // Otherwise, check the last instruction. + // Check if the last terminator is an unconditional branch. + MachineBasicBlock::const_iterator I = Pred->end(); + while (I != Pred->begin() && !(--I)->getDesc().isTerminator()) ; + + return !I->getDesc().isBarrier(); } // Print out an operand for an inline asm expression. -bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, +bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) { // Does this asm operand have a single letter operand modifier? - if (ExtraCode && ExtraCode[0]) + if (ExtraCode && ExtraCode[0]) return true; // Unknown modifier. 
printOperand(MI, OpNo, O); @@ -273,22 +299,9 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, switch(MO.getTargetFlags()) { case MipsII::MO_GPREL: O << "%gp_rel("; break; case MipsII::MO_GOT_CALL: O << "%call16("; break; - case MipsII::MO_GOT: { - const MachineOperand &LastMO = MI->getOperand(opNum-1); - bool LastMOIsGP = LastMO.getType() == MachineOperand::MO_Register - && LastMO.getReg() == Mips::GP; - if (MI->getOpcode() == Mips::LW || LastMOIsGP) - O << "%got("; - else - O << "%lo("; - break; - } - case MipsII::MO_ABS_HILO: - if (MI->getOpcode() == Mips::LUi) - O << "%hi("; - else - O << "%lo("; - break; + case MipsII::MO_GOT: O << "%got("; break; + case MipsII::MO_ABS_HI: O << "%hi("; break; + case MipsII::MO_ABS_LO: O << "%lo("; break; } switch (MO.getType()) { @@ -308,6 +321,12 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, O << *Mang->getSymbol(MO.getGlobal()); break; + case MachineOperand::MO_BlockAddress: { + MCSymbol* BA = GetBlockAddressSymbol(MO.getBlockAddress()); + O << BA->getName(); + break; + } + case MachineOperand::MO_ExternalSymbol: O << *GetExternalSymbolSymbol(MO.getSymbolName()); break; @@ -323,7 +342,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, if (MO.getOffset()) O << "+" << MO.getOffset(); break; - + default: llvm_unreachable("<unknown operand type>"); } @@ -336,7 +355,7 @@ void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum, const MachineOperand &MO = MI->getOperand(opNum); if (MO.isImm()) O << (unsigned short int)MO.getImm(); - else + else printOperand(MI, opNum, O); } @@ -352,8 +371,8 @@ printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, return; } - // Load/Store memory operands -- imm($reg) - // If PIC target the target is loaded as the + // Load/Store memory operands -- imm($reg) + // If PIC target the target is loaded as the // pattern lw $25,%call16($28) printOperand(MI, opNum, O); O << "("; @@ -365,12 +384,12 @@ void MipsAsmPrinter:: printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier) { const MachineOperand& MO = MI->getOperand(opNum); - O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm()); + O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm()); } void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // FIXME: Use SwitchSection. - + // Tell the assembler which ABI we are using OutStreamer.EmitRawText("\t.section .mdebug." + Twine(getCurrentABIString())); @@ -383,11 +402,11 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { } // return to previous section - OutStreamer.EmitRawText(StringRef("\t.previous")); + OutStreamer.EmitRawText(StringRef("\t.previous")); } // Force static initialization. -extern "C" void LLVMInitializeMipsAsmPrinter() { +extern "C" void LLVMInitializeMipsAsmPrinter() { RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget); RegisterAsmPrinter<MipsAsmPrinter> Y(TheMipselTarget); } diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index 8f313ef..57aeb1d 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -1,23 +1,23 @@ //===- MipsCallingConv.td - Calling Conventions for Mips ---*- tablegen -*-===// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // This describes the calling conventions for Mips architecture. 
//===----------------------------------------------------------------------===// /// CCIfSubtarget - Match if the current subtarget has a feature F. -class CCIfSubtarget<string F, CCAction A>: +class CCIfSubtarget<string F, CCAction A>: CCIf<!strconcat("State.getTarget().getSubtarget<MipsSubtarget>().", F), A>; //===----------------------------------------------------------------------===// // Mips O32 Calling Convention //===----------------------------------------------------------------------===// -// Only the return rules are defined here for O32. The rules for argument +// Only the return rules are defined here for O32. The rules for argument // passing are defined in MipsISelLowering.cpp. def RetCC_MipsO32 : CallingConv<[ // i32 are returned in registers V0, V1 @@ -41,15 +41,15 @@ def CC_MipsEABI : CallingConv<[ // Integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>, - // Single fp arguments are passed in pairs within 32-bit mode - CCIfType<[f32], CCIfSubtarget<"isSingleFloat()", + // Single fp arguments are passed in pairs within 32-bit mode + CCIfType<[f32], CCIfSubtarget<"isSingleFloat()", CCAssignToReg<[F12, F13, F14, F15, F16, F17, F18, F19]>>>, - CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()", + CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[F12, F14, F16, F18]>>>, - // The first 4 doubl fp arguments are passed in single fp registers. - CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", + // The first 4 double fp arguments are passed in single fp registers. + CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D6, D7, D8, D9]>>>, // Integer values get stored in stack slots that are 4 bytes in diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp new file mode 100644 index 0000000..4423f51 --- /dev/null +++ b/lib/Target/Mips/MipsExpandPseudo.cpp @@ -0,0 +1,117 @@ +//===-- MipsExpandPseudo.cpp - Expand pseudo instructions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands pseudo instructions into target instructions after register +// allocation but before post-RA scheduling. 
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-expand-pseudo"
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+namespace {
+  struct MipsExpandPseudo : public MachineFunctionPass {
+
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    MipsExpandPseudo(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "Mips PseudoInstrs Expansion";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+  private:
+    void ExpandBuildPairF64(MachineBasicBlock&, MachineBasicBlock::iterator);
+    void ExpandExtractElementF64(MachineBasicBlock&,
+                                 MachineBasicBlock::iterator);
+  };
+  char MipsExpandPseudo::ID = 0;
+} // end of anonymous namespace
+
+bool MipsExpandPseudo::runOnMachineFunction(MachineFunction& F) {
+  bool Changed = false;
+
+  for (MachineFunction::iterator I = F.begin(); I != F.end(); ++I)
+    Changed |= runOnMachineBasicBlock(*I);
+
+  return Changed;
+}
+
+bool MipsExpandPseudo::runOnMachineBasicBlock(MachineBasicBlock& MBB) {
+
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
+    const TargetInstrDesc& Tid = I->getDesc();
+
+    switch(Tid.getOpcode()) {
+    default:
+      ++I;
+      continue;
+    case Mips::BuildPairF64:
+      ExpandBuildPairF64(MBB, I);
+      break;
+    case Mips::ExtractElementF64:
+      ExpandExtractElementF64(MBB, I);
+      break;
+    }
+
+    // delete original instr
+    MBB.erase(I++);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+void MipsExpandPseudo::ExpandBuildPairF64(MachineBasicBlock& MBB,
+                                          MachineBasicBlock::iterator I) {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
+  const TargetInstrDesc& Mtc1Tdd = TII->get(Mips::MTC1);
+  DebugLoc dl = I->getDebugLoc();
+  const unsigned* SubReg =
+    TM.getRegisterInfo()->getSubRegisters(DstReg);
+
+  // mtc1 Lo, $fp
+  // mtc1 Hi, $fp + 1
+  BuildMI(MBB, I, dl, Mtc1Tdd, *SubReg).addReg(LoReg);
+  BuildMI(MBB, I, dl, Mtc1Tdd, *(SubReg + 1)).addReg(HiReg);
+}
+
+void MipsExpandPseudo::ExpandExtractElementF64(MachineBasicBlock& MBB,
+                                               MachineBasicBlock::iterator I) {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned SrcReg = I->getOperand(1).getReg();
+  unsigned N = I->getOperand(2).getImm();
+  const TargetInstrDesc& Mfc1Tdd = TII->get(Mips::MFC1);
+  DebugLoc dl = I->getDebugLoc();
+  const unsigned* SubReg = TM.getRegisterInfo()->getSubRegisters(SrcReg);
+
+  BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(*(SubReg + N));
+}
+
+/// createMipsExpandPseudoPass - Returns a pass that expands pseudo
+/// instrs into real instrs.
+FunctionPass *llvm::createMipsExpandPseudoPass(MipsTargetMachine &tm) {
+  return new MipsExpandPseudo(tm);
+}
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 87a097a..21e3314 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -203,6 +203,46 @@ void MipsFrameLowering::adjustMipsStackFrame(MachineFunction &MF) const {
   MipsFI->setFPUTopSavedRegOff(TopFPUSavedRegOff-StackOffset);
 }
 
+
+// Expand a pair of register and immediate if the immediate doesn't fit in the
+// 16-bit offset field.
+// e.g.
+// if OrigImm = 0x10000, OrigReg = $sp:
+// generate the following sequence of instrs:
+//   lui   $at, hi(0x10000)
+//   addu  $at, $sp, $at
+//
+// (NewReg, NewImm) = ($at, lo(0x10000))
+// return true
+static bool expandRegLargeImmPair(unsigned OrigReg, int OrigImm,
+                                  unsigned& NewReg, int& NewImm,
+                                  MachineBasicBlock& MBB,
+                                  MachineBasicBlock::iterator I) {
+  // OrigImm fits in the 16-bit field
+  if (OrigImm < 0x8000 && OrigImm >= -0x8000) {
+    NewReg = OrigReg;
+    NewImm = OrigImm;
+    return false;
+  }
+
+  MachineFunction* MF = MBB.getParent();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  DebugLoc DL = I->getDebugLoc();
+  int ImmLo = OrigImm & 0xffff;
+  int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) +
+              ((OrigImm & 0x8000) != 0);
+
+  // FIXME: change this when mips goes MC.
+  BuildMI(MBB, I, DL, TII->get(Mips::NOAT));
+  BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::AT).addImm(ImmHi);
+  BuildMI(MBB, I, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg)
+    .addReg(Mips::AT);
+  NewReg = Mips::AT;
+  NewImm = ImmLo;
+
+  return true;
+}
+
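
(A standalone illustration, not part of the patch, of the hi/lo arithmetic
above: ADDiu/LW/SW sign-extend their 16-bit immediate, so when bit 15 of the
offset is set the high half must be incremented by one to compensate.)

  #include <cassert>
  #include <cstdio>

  int main() {
    int OrigImm = 0x18000;        // does not fit in a signed 16-bit field
    int ImmLo = OrigImm & 0xffff; // 0x8000, reads back as -32768
    int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) +
                ((OrigImm & 0x8000) != 0); // 0x1 + 1 = 0x2
    // lui $at, 0x2 ; addu $at, $sp, $at ; ... -32768($at)
    assert((ImmHi << 16) + (short)ImmLo == OrigImm); // round-trips exactly
    printf("hi=0x%x lo=0x%x\n", ImmHi, ImmLo);
    return 0;
  }
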
 void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -214,6 +254,9 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   bool isPIC = (MF.getTarget().getRelocationModel() == Reloc::PIC_);
+  unsigned NewReg = 0;
+  int NewImm = 0;
+  bool ATUsed;
 
   // Get the right frame order for Mips.
   adjustMipsStackFrame(MF);
@@ -236,22 +279,40 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO));
 
   // Adjust stack : addi sp, sp, (-imm)
+  ATUsed = expandRegLargeImmPair(Mips::SP, -StackSize, NewReg, NewImm, MBB,
+                                 MBBI);
   BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
-    .addReg(Mips::SP).addImm(-StackSize);
+    .addReg(NewReg).addImm(NewImm);
+
+  // FIXME: change this when mips goes MC.
+  if (ATUsed)
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
 
-  // Save the return address only if the function isnt a leaf one.
+  // Save the return address only if the function isn't a leaf one.
   // sw $ra, stack_loc($sp)
   if (MFI->adjustsStack()) {
+    ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
-      .addReg(Mips::RA).addImm(RAOffset).addReg(Mips::SP);
+      .addReg(Mips::RA).addImm(NewImm).addReg(NewReg);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 
   // if framepointer enabled, save it and set it
   // to point to the stack pointer
   if (hasFP(MF)) {
     // sw $fp,stack_loc($sp)
+    ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
-      .addReg(Mips::FP).addImm(FPOffset).addReg(Mips::SP);
+      .addReg(Mips::FP).addImm(NewImm).addReg(NewReg);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
 
     // move $fp, $sp
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::FP)
@@ -280,6 +341,10 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
   int FPOffset = MipsFI->getFPStackOffset();
   int RAOffset = MipsFI->getRAStackOffset();
 
+  unsigned NewReg = 0;
+  int NewImm = 0;
+  bool ATUsed = false;
+
   // if framepointer enabled, restore it and restore the
   // stack pointer
   if (hasFP(MF)) {
@@ -288,21 +353,39 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
       .addReg(Mips::FP).addReg(Mips::ZERO);
 
     // lw $fp,stack_loc($sp)
+    ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::FP)
-      .addImm(FPOffset).addReg(Mips::SP);
+      .addImm(NewImm).addReg(NewReg);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 
-  // Restore the return address only if the function isnt a leaf one.
+  // Restore the return address only if the function isn't a leaf one.
   // lw $ra, stack_loc($sp)
   if (MFI->adjustsStack()) {
+    ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::RA)
-      .addImm(RAOffset).addReg(Mips::SP);
+      .addImm(NewImm).addReg(NewReg);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 
   // adjust stack : insert addi sp, sp, (imm)
   if (NumBytes) {
+    ATUsed = expandRegLargeImmPair(Mips::SP, NumBytes, NewReg, NewImm, MBB,
+                                   MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
-      .addReg(Mips::SP).addImm(NumBytes);
+      .addReg(NewReg).addImm(NewImm);
+
+    // FIXME: change this when mips goes MC.
+    if (ATUsed)
+      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 }
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index a8426c1..34647df 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ALPHA_FRAMEINFO_H
-#define ALPHA_FRAMEINFO_H
+#ifndef MIPS_FRAMEINFO_H
+#define MIPS_FRAMEINFO_H
 
 #include "Mips.h"
 #include "MipsSubtarget.h"
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 755e04d..0382964 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -52,19 +52,19 @@ class MipsDAGToDAGISel : public SelectionDAGISel {
   /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
   /// make the right decision when generating code for different targets.
   const MipsSubtarget &Subtarget;
-  
+
 public:
   explicit MipsDAGToDAGISel(MipsTargetMachine &tm) :
   SelectionDAGISel(tm),
   TM(tm), Subtarget(tm.getSubtarget<MipsSubtarget>()) {}
-  
+
   // Pass Name
   virtual const char *getPassName() const {
     return "MIPS DAG->DAG Pattern Instruction Selection";
-  }
-  
-private: 
+  }
+
+private:
   // Include the pieces autogenerated from the target description.
#include "MipsGenDAGISel.inc" @@ -116,12 +116,14 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) { Offset = CurDAG->getTargetConstant(0, MVT::i32); return true; } - + // on PIC code Load GA if (TM.getRelocationModel() == Reloc::PIC_) { - if ((Addr.getOpcode() == ISD::TargetGlobalAddress) || - (Addr.getOpcode() == ISD::TargetConstantPool) || - (Addr.getOpcode() == ISD::TargetJumpTable)){ + if ((Addr.getOpcode() == ISD::TargetGlobalAddress) || + (Addr.getOpcode() == ISD::TargetConstantPool) || + (Addr.getOpcode() == ISD::TargetJumpTable) || + (Addr.getOpcode() == ISD::TargetBlockAddress) || + (Addr.getOpcode() == ISD::TargetExternalSymbol)) { Base = CurDAG->getRegister(Mips::GP, MVT::i32); Offset = Addr; return true; @@ -130,8 +132,8 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) { if ((Addr.getOpcode() == ISD::TargetExternalSymbol || Addr.getOpcode() == ISD::TargetGlobalAddress)) return false; - } - + } + // Operand is a result from an ADD. if (Addr.getOpcode() == ISD::ADD) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { @@ -158,10 +160,10 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) { // Generate: // lui $2, %hi($CPI1_0) // lwc1 $f0, %lo($CPI1_0)($2) - if ((Addr.getOperand(0).getOpcode() == MipsISD::Hi || + if ((Addr.getOperand(0).getOpcode() == MipsISD::Hi || Addr.getOperand(0).getOpcode() == ISD::LOAD) && Addr.getOperand(1).getOpcode() == MipsISD::Lo) { - SDValue LoVal = Addr.getOperand(1); + SDValue LoVal = Addr.getOperand(1); if (dyn_cast<ConstantPoolSDNode>(LoVal.getOperand(0))) { Base = Addr.getOperand(0); Offset = LoVal.getOperand(0); @@ -176,7 +178,7 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) { } SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) { - MVT::SimpleValueType NVT = + MVT::SimpleValueType NVT = N->getValueType(0).getSimpleVT().SimpleTy; if (!Subtarget.isMips1() || NVT != MVT::f64) @@ -199,14 +201,14 @@ SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) { MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); DebugLoc dl = N->getDebugLoc(); - // The second load should start after for 4 bytes. + // The second load should start after for 4 bytes. 
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0)) Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32); else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Offset0)) - Offset1 = CurDAG->getTargetConstantPool(CP->getConstVal(), - MVT::i32, - CP->getAlignment(), - CP->getOffset()+4, + Offset1 = CurDAG->getTargetConstantPool(CP->getConstVal(), + MVT::i32, + CP->getAlignment(), + CP->getOffset()+4, CP->getTargetFlags()); else return NULL; @@ -220,16 +222,16 @@ SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) { // Generate: // lwc $f0, X($3) // lwc $f1, X+4($3) - SDNode *LD0 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32, + SDNode *LD0 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32, MVT::Other, Offset0, Base, Chain); SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, NVT), 0); - SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl, + SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl, MVT::f64, Undef, SDValue(LD0, 0)); SDNode *LD1 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32, MVT::Other, Offset1, Base, SDValue(LD0, 1)); - SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl, + SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl, MVT::f64, I0, SDValue(LD1, 0)); ReplaceUses(SDValue(N, 0), I1); @@ -241,7 +243,7 @@ SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) { SDNode *MipsDAGToDAGISel::SelectStoreFp64(SDNode *N) { - if (!Subtarget.isMips1() || + if (!Subtarget.isMips1() || N->getOperand(1).getValueType() != MVT::f64) return NULL; @@ -265,12 +267,12 @@ SDNode *MipsDAGToDAGISel::SelectStoreFp64(SDNode *N) { DebugLoc dl = N->getDebugLoc(); // Get the even and odd part from the f64 register - SDValue FPOdd = CurDAG->getTargetExtractSubreg(Mips::sub_fpodd, + SDValue FPOdd = CurDAG->getTargetExtractSubreg(Mips::sub_fpodd, dl, MVT::f32, N1); SDValue FPEven = CurDAG->getTargetExtractSubreg(Mips::sub_fpeven, dl, MVT::f32, N1); - // The second store should start after for 4 bytes. + // The second store should start after for 4 bytes. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0)) Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32); else @@ -315,26 +317,26 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { } /// - // Instruction Selection not handled by the auto-generated + // Instruction Selection not handled by the auto-generated // tablegen selection should be handled here. 
- /// + /// switch(Opcode) { default: break; - case ISD::SUBE: + case ISD::SUBE: case ISD::ADDE: { SDValue InFlag = Node->getOperand(2), CmpLHS; unsigned Opc = InFlag.getOpcode(); (void)Opc; - assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || - (Opc == ISD::SUBC || Opc == ISD::SUBE)) && + assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || + (Opc == ISD::SUBC || Opc == ISD::SUBE)) && "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn"); unsigned MOp; if (Opcode == ISD::ADDE) { CmpLHS = InFlag.getValue(0); MOp = Mips::ADDu; - } else { + } else { CmpLHS = InFlag.getOperand(0); MOp = Mips::SUBu; } @@ -346,7 +348,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { EVT VT = LHS.getValueType(); SDNode *Carry = CurDAG->getMachineNode(Mips::SLTu, dl, VT, Ops, 2); - SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, dl, VT, + SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, dl, VT, SDValue(Carry,0), RHS); return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, @@ -356,36 +358,34 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { /// Mul/Div with two results case ISD::SDIVREM: case ISD::UDIVREM: + break; case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: { SDValue Op1 = Node->getOperand(0); SDValue Op2 = Node->getOperand(1); unsigned Op; - if (Opcode == ISD::UMUL_LOHI || Opcode == ISD::SMUL_LOHI) - Op = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT); - else - Op = (Opcode == ISD::UDIVREM ? Mips::DIVu : Mips::DIV); + Op = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT); - SDNode *MulDiv = CurDAG->getMachineNode(Op, dl, MVT::Glue, Op1, Op2); + SDNode *Mul = CurDAG->getMachineNode(Op, dl, MVT::Glue, Op1, Op2); - SDValue InFlag = SDValue(MulDiv, 0); - SDNode *Lo = CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32, + SDValue InFlag = SDValue(Mul, 0); + SDNode *Lo = CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32, MVT::Glue, InFlag); InFlag = SDValue(Lo,1); SDNode *Hi = CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag); - if (!SDValue(Node, 0).use_empty()) + if (!SDValue(Node, 0).use_empty()) ReplaceUses(SDValue(Node, 0), SDValue(Lo,0)); - if (!SDValue(Node, 1).use_empty()) + if (!SDValue(Node, 1).use_empty()) ReplaceUses(SDValue(Node, 1), SDValue(Hi,0)); return NULL; } /// Special Muls - case ISD::MUL: + case ISD::MUL: if (Subtarget.isMips32()) break; case ISD::MULHS: @@ -394,7 +394,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { SDValue MulOp2 = Node->getOperand(1); unsigned MulOp = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT); - SDNode *MulNode = CurDAG->getMachineNode(MulOp, dl, + SDNode *MulNode = CurDAG->getMachineNode(MulOp, dl, MVT::Glue, MulOp1, MulOp2); SDValue InFlag = SDValue(MulNode, 0); @@ -408,24 +408,9 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { /// Div/Rem operations case ISD::SREM: case ISD::UREM: - case ISD::SDIV: - case ISD::UDIV: { - SDValue Op1 = Node->getOperand(0); - SDValue Op2 = Node->getOperand(1); - - unsigned Op, MOp; - if (Opcode == ISD::SDIV || Opcode == ISD::UDIV) { - Op = (Opcode == ISD::SDIV ? Mips::DIV : Mips::DIVu); - MOp = Mips::MFLO; - } else { - Op = (Opcode == ISD::SREM ? Mips::DIV : Mips::DIVu); - MOp = Mips::MFHI; - } - SDNode *Node = CurDAG->getMachineNode(Op, dl, MVT::Glue, Op1, Op2); - - SDValue InFlag = SDValue(Node, 0); - return CurDAG->getMachineNode(MOp, dl, MVT::i32, InFlag); - } + case ISD::SDIV: + case ISD::UDIV: + break; // Get target GOT address. 
case ISD::GLOBAL_OFFSET_TABLE: @@ -433,15 +418,15 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { case ISD::ConstantFP: { ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node); - if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) { - SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) { + SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, Mips::ZERO, MVT::i32); SDValue Undef = SDValue( CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::f64), 0); SDNode *MTC = CurDAG->getMachineNode(Mips::MTC1, dl, MVT::f32, Zero); - SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl, + SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl, MVT::f64, Undef, SDValue(MTC, 0)); - SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl, + SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl, MVT::f64, I0, SDValue(MTC, 0)); ReplaceUses(SDValue(Node, 0), I1); return I1.getNode(); @@ -460,61 +445,6 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { return ResNode; // Other cases are autogenerated. break; - - /// Handle direct and indirect calls when using PIC. On PIC, when - /// GOT is smaller than about 64k (small code) the GA target is - /// loaded with only one instruction. Otherwise GA's target must - /// be loaded with 3 instructions. - case MipsISD::JmpLink: { - if (TM.getRelocationModel() == Reloc::PIC_) { - unsigned LastOpNum = Node->getNumOperands()-1; - - SDValue Chain = Node->getOperand(0); - SDValue Callee = Node->getOperand(1); - SDValue InFlag; - - // Skip the incomming flag if present - if (Node->getOperand(LastOpNum).getValueType() == MVT::Glue) - LastOpNum--; - - if ( (isa<GlobalAddressSDNode>(Callee)) || - (isa<ExternalSymbolSDNode>(Callee)) ) - { - /// Direct call for global addresses and external symbols - SDValue GPReg = CurDAG->getRegister(Mips::GP, MVT::i32); - - // Use load to get GOT target - SDValue Ops[] = { Callee, GPReg, Chain }; - SDValue Load = SDValue(CurDAG->getMachineNode(Mips::LW, dl, MVT::i32, - MVT::Other, Ops, 3), 0); - Chain = Load.getValue(1); - - // Call target must be on T9 - Chain = CurDAG->getCopyToReg(Chain, dl, Mips::T9, Load, InFlag); - } else - /// Indirect call - Chain = CurDAG->getCopyToReg(Chain, dl, Mips::T9, Callee, InFlag); - - // Map the JmpLink operands to JALR - SDVTList NodeTys = CurDAG->getVTList(MVT::Other, MVT::Glue); - SmallVector<SDValue, 8> Ops; - Ops.push_back(CurDAG->getRegister(Mips::T9, MVT::i32)); - - for (unsigned i = 2, e = LastOpNum+1; i != e; ++i) - Ops.push_back(Node->getOperand(i)); - Ops.push_back(Chain); - Ops.push_back(Chain.getValue(1)); - - // Emit Jump and Link Register - SDNode *ResNode = CurDAG->getMachineNode(Mips::JALR, dl, NodeTys, - &Ops[0], Ops.size()); - - // Replace Chain and InFlag - ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); - ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 1)); - return ResNode; - } - } } // Select the default instruction @@ -529,7 +459,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { return ResNode; } -/// createMipsISelDag - This pass converts a legalized DAG into a +/// createMipsISelDag - This pass converts a legalized DAG into a /// MIPS-specific DAG, ready for instruction scheduling. 
FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) { return new MipsDAGToDAGISel(TM); diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 1d7a1c0..1f1220f 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -41,15 +41,19 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::Lo : return "MipsISD::Lo"; case MipsISD::GPRel : return "MipsISD::GPRel"; case MipsISD::Ret : return "MipsISD::Ret"; - case MipsISD::SelectCC : return "MipsISD::SelectCC"; - case MipsISD::FPSelectCC : return "MipsISD::FPSelectCC"; case MipsISD::FPBrcond : return "MipsISD::FPBrcond"; case MipsISD::FPCmp : return "MipsISD::FPCmp"; + case MipsISD::CMovFP_T : return "MipsISD::CMovFP_T"; + case MipsISD::CMovFP_F : return "MipsISD::CMovFP_F"; case MipsISD::FPRound : return "MipsISD::FPRound"; case MipsISD::MAdd : return "MipsISD::MAdd"; case MipsISD::MAddu : return "MipsISD::MAddu"; case MipsISD::MSub : return "MipsISD::MSub"; case MipsISD::MSubu : return "MipsISD::MSubu"; + case MipsISD::DivRem : return "MipsISD::DivRem"; + case MipsISD::DivRemU : return "MipsISD::DivRemU"; + case MipsISD::BuildPairF64: return "MipsISD::BuildPairF64"; + case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64"; default : return NULL; } } @@ -89,25 +93,22 @@ MipsTargetLowering(MipsTargetMachine &TM) // Mips Custom Operations setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SETCC, MVT::f32, Custom); - setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::VASTART, MVT::Other, Custom); - - // We custom lower AND/OR to handle the case where the DAG contain 'ands/ors' - // with operands comming from setcc fp comparions. This is necessary since - // the result from these setcc are in a flag registers (FCR31). - setOperationAction(ISD::AND, MVT::i32, Custom); - setOperationAction(ISD::OR, MVT::i32, Custom); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); // Operations not directly supported by Mips. 
setOperationAction(ISD::BR_JT, MVT::Other, Expand); @@ -129,7 +130,9 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOWI, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FLOG, MVT::f32, Expand); @@ -139,6 +142,10 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + // Use the default for now setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); @@ -160,6 +167,9 @@ MipsTargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::ADDE); setTargetDAGCombine(ISD::SUBE); + setTargetDAGCombine(ISD::SDIVREM); + setTargetDAGCombine(ISD::UDIVREM); + setTargetDAGCombine(ISD::SETCC); setStackPointerRegisterToSaveRestore(Mips::SP); computeRegisterProperties(); @@ -181,7 +191,7 @@ unsigned MipsTargetLowering::getFunctionAlignment(const Function *) const { // multHi/Lo: product of multiplication // Lo0: initial value of Lo register // Hi0: initial value of Hi register -// Return true if mattern matching was successful. +// Return true if pattern matching was successful. static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) { // ADDENode's second operand must be a flag output of an ADDC node in order // for the matching to be successful. @@ -255,7 +265,7 @@ static bool SelectMadd(SDNode* ADDENode, SelectionDAG* CurDAG) { // multHi/Lo: product of multiplication // Lo0: initial value of Lo register // Hi0: initial value of Hi register -// Return true if mattern matching was successful. +// Return true if pattern matching was successful. static bool SelectMsub(SDNode* SUBENode, SelectionDAG* CurDAG) { // SUBENode's second operand must be a flag output of an SUBC node in order // for the matching to be successful. @@ -346,6 +356,130 @@ static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG, return SDValue(); } +static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget* Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + unsigned opc = N->getOpcode() == ISD::SDIVREM ? 
MipsISD::DivRem : + MipsISD::DivRemU; + DebugLoc dl = N->getDebugLoc(); + + SDValue DivRem = DAG.getNode(opc, dl, MVT::Glue, + N->getOperand(0), N->getOperand(1)); + SDValue InChain = DAG.getEntryNode(); + SDValue InGlue = DivRem; + + // insert MFLO + if (N->hasAnyUseOfValue(0)) { + SDValue CopyFromLo = DAG.getCopyFromReg(InChain, dl, Mips::LO, MVT::i32, + InGlue); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyFromLo); + InChain = CopyFromLo.getValue(1); + InGlue = CopyFromLo.getValue(2); + } + + // insert MFHI + if (N->hasAnyUseOfValue(1)) { + SDValue CopyFromHi = DAG.getCopyFromReg(InChain, dl, + Mips::HI, MVT::i32, InGlue); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi); + } + + return SDValue(); +} + +static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Unknown fp condition code!"); + case ISD::SETEQ: + case ISD::SETOEQ: return Mips::FCOND_OEQ; + case ISD::SETUNE: return Mips::FCOND_UNE; + case ISD::SETLT: + case ISD::SETOLT: return Mips::FCOND_OLT; + case ISD::SETGT: + case ISD::SETOGT: return Mips::FCOND_OGT; + case ISD::SETLE: + case ISD::SETOLE: return Mips::FCOND_OLE; + case ISD::SETGE: + case ISD::SETOGE: return Mips::FCOND_OGE; + case ISD::SETULT: return Mips::FCOND_ULT; + case ISD::SETULE: return Mips::FCOND_ULE; + case ISD::SETUGT: return Mips::FCOND_UGT; + case ISD::SETUGE: return Mips::FCOND_UGE; + case ISD::SETUO: return Mips::FCOND_UN; + case ISD::SETO: return Mips::FCOND_OR; + case ISD::SETNE: + case ISD::SETONE: return Mips::FCOND_ONE; + case ISD::SETUEQ: return Mips::FCOND_UEQ; + } +} + + +// Returns true if condition code has to be inverted. +static bool InvertFPCondCode(Mips::CondCode CC) { + if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT) + return false; + + if (CC >= Mips::FCOND_T && CC <= Mips::FCOND_GT) + return true; + + assert(false && "Illegal Condition Code"); + return false; +} + +// Creates and returns an FPCmp node from a setcc node. +// Returns Op if setcc is not a floating point comparison. +static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) { + // must be a SETCC node + if (Op.getOpcode() != ISD::SETCC) + return Op; + + SDValue LHS = Op.getOperand(0); + + if (!LHS.getValueType().isFloatingPoint()) + return Op; + + SDValue RHS = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + + // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of + // node if necessary. + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + return DAG.getNode(MipsISD::FPCmp, dl, MVT::Glue, LHS, RHS, + DAG.getConstant(FPCondCCodeToFCC(CC), MVT::i32)); +} + +// Creates and returns a CMovFPT/F node. +static SDValue CreateCMovFP(SelectionDAG& DAG, SDValue Cond, SDValue True, + SDValue False, DebugLoc DL) { + bool invert = InvertFPCondCode((Mips::CondCode) + cast<ConstantSDNode>(Cond.getOperand(2)) + ->getSExtValue()); + + return DAG.getNode((invert ? 
MipsISD::CMovFP_F : MipsISD::CMovFP_T), DL, + True.getValueType(), True, False, Cond); +} + +static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG& DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget* Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue Cond = CreateFPCmp(DAG, SDValue(N, 0)); + + if (Cond.getOpcode() != MipsISD::FPCmp) + return SDValue(); + + SDValue True = DAG.getConstant(1, MVT::i32); + SDValue False = DAG.getConstant(0, MVT::i32); + + return CreateCMovFP(DAG, Cond, True, False, N->getDebugLoc()); +} + SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -357,6 +491,11 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return PerformADDECombine(N, DAG, DCI, Subtarget); case ISD::SUBE: return PerformSUBECombine(N, DAG, DCI, Subtarget); + case ISD::SDIVREM: + case ISD::UDIVREM: + return PerformDivRemCombine(N, DAG, DCI, Subtarget); + case ISD::SETCC: + return PerformSETCCCombine(N, DAG, DCI, Subtarget); } return SDValue(); @@ -367,17 +506,15 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { - case ISD::AND: return LowerANDOR(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); - case ISD::OR: return LowerANDOR(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); } return SDValue(); @@ -410,122 +547,110 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) { return Mips::BRANCH_INVALID; } -static unsigned FPBranchCodeToOpc(Mips::FPBranchCode BC) { - switch(BC) { - default: - llvm_unreachable("Unknown branch code"); - case Mips::BRANCH_T : return Mips::BC1T; - case Mips::BRANCH_F : return Mips::BC1F; - case Mips::BRANCH_TL : return Mips::BC1TL; - case Mips::BRANCH_FL : return Mips::BC1FL; - } -} - -static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) { - switch (CC) { - default: llvm_unreachable("Unknown fp condition code!"); - case ISD::SETEQ: - case ISD::SETOEQ: return Mips::FCOND_EQ; - case ISD::SETUNE: return Mips::FCOND_OGL; - case ISD::SETLT: - case ISD::SETOLT: return Mips::FCOND_OLT; - case ISD::SETGT: - case ISD::SETOGT: return Mips::FCOND_OGT; - case ISD::SETLE: - case ISD::SETOLE: return Mips::FCOND_OLE; - case ISD::SETGE: - case ISD::SETOGE: return Mips::FCOND_OGE; - case ISD::SETULT: return Mips::FCOND_ULT; - case ISD::SETULE: return Mips::FCOND_ULE; - case ISD::SETUGT: return Mips::FCOND_UGT; - case ISD::SETUGE: return Mips::FCOND_UGE; - case ISD::SETUO: return Mips::FCOND_UN; - case ISD::SETO: return Mips::FCOND_OR; - case ISD::SETNE: - case ISD::SETONE: return Mips::FCOND_NEQ; - case ISD::SETUEQ: return Mips::FCOND_UEQ; - } -} - MachineBasicBlock * MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { + // There is no need to expand CMov instructions if target has + // conditional moves. 
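
(A worked illustration, not from the patch: what the SETCC combine above
produces for a floating-point compare whose i32 result is used directly.
Node spelling follows this file; the instructions finally selected depend
on the subtarget.)

  // C source:
  //   int isLess(float a, float b) { return a < b; }
  // The (setcc a, b, setolt) node is rewritten, conceptually, into:
  //   %flag = MipsISD::FPCmp a, b, FCOND_OLT   // sets the FP condition flag
  //   %res  = MipsISD::CMovFP_T 1, 0, %flag    // movt-style pick of 1 or 0
  // so an i32-valued fp comparison becomes a compare plus a conditional
  // move of the constants 1/0 rather than a branch sequence.
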
+ if (Subtarget->hasCondMov()) + return BB; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); bool isFPCmp = false; DebugLoc dl = MI->getDebugLoc(); + unsigned Opc; switch (MI->getOpcode()) { default: assert(false && "Unexpected instr type to insert"); - case Mips::Select_FCC: - case Mips::Select_FCC_S32: - case Mips::Select_FCC_D32: - isFPCmp = true; // FALL THROUGH - case Mips::Select_CC: - case Mips::Select_CC_S32: - case Mips::Select_CC_D32: { - // To "insert" a SELECT_CC instruction, we actually have to insert the - // diamond control-flow pattern. The incoming instruction knows the - // destination vreg to set, the condition code register to branch on, the - // true/false values to select between, and a branch opcode to use. - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; - - // thisMBB: - // ... - // TrueVal = ... - // setcc r1, r2, r3 - // bNE r1, r0, copy1MBB - // fallthrough --> copy0MBB - MachineBasicBlock *thisMBB = BB; - MachineFunction *F = BB->getParent(); - MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(It, copy0MBB); - F->insert(It, sinkMBB); - - // Transfer the remainder of BB and its successor edges to sinkMBB. - sinkMBB->splice(sinkMBB->begin(), BB, - llvm::next(MachineBasicBlock::iterator(MI)), - BB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(BB); - - // Next, add the true and fallthrough blocks as its successors. - BB->addSuccessor(copy0MBB); - BB->addSuccessor(sinkMBB); - - // Emit the right instruction according to the type of the operands compared - if (isFPCmp) { - // Find the condiction code present in the setcc operation. - Mips::CondCode CC = (Mips::CondCode)MI->getOperand(4).getImm(); - // Get the branch opcode from the branch code. - unsigned Opc = FPBranchCodeToOpc(GetFPBranchCodeFromCond(CC)); - BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB); - } else - BuildMI(BB, dl, TII->get(Mips::BNE)).addReg(MI->getOperand(1).getReg()) - .addReg(Mips::ZERO).addMBB(sinkMBB); - - // copy0MBB: - // %FalseValue = ... - // # fallthrough to sinkMBB - BB = copy0MBB; - - // Update machine-CFG edges - BB->addSuccessor(sinkMBB); - - // sinkMBB: - // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ] - // ... - BB = sinkMBB; + case Mips::MOVT: + case Mips::MOVT_S: + case Mips::MOVT_D: + isFPCmp = true; + Opc = Mips::BC1F; + break; + case Mips::MOVF: + case Mips::MOVF_S: + case Mips::MOVF_D: + isFPCmp = true; + Opc = Mips::BC1T; + break; + case Mips::MOVZ_I: + case Mips::MOVZ_S: + case Mips::MOVZ_D: + Opc = Mips::BNE; + break; + case Mips::MOVN_I: + case Mips::MOVN_S: + case Mips::MOVN_D: + Opc = Mips::BEQ; + break; + } + + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... 
+ // setcc r1, r2, r3 + // bNE r1, r0, copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Next, add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // Emit the right instruction according to the type of the operands compared + if (isFPCmp) + BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB); + else + BuildMI(BB, dl, TII->get(Opc)).addReg(MI->getOperand(2).getReg()) + .addReg(Mips::ZERO).addMBB(sinkMBB); + + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ] + // ... + BB = sinkMBB; + + if (isFPCmp) BuildMI(*BB, BB->begin(), dl, TII->get(Mips::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB) - .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB); + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB); + else + BuildMI(*BB, BB->begin(), dl, + TII->get(Mips::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(3).getReg()).addMBB(thisMBB) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB); - MI->eraseFromParent(); // The pseudo instruction is gone now. - return BB; - } - } + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; } //===----------------------------------------------------------------------===// @@ -590,27 +715,6 @@ LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const } SDValue MipsTargetLowering:: -LowerANDOR(SDValue Op, SelectionDAG &DAG) const -{ - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); - - if (LHS.getOpcode() != MipsISD::FPCmp || RHS.getOpcode() != MipsISD::FPCmp) - return Op; - - SDValue True = DAG.getConstant(1, MVT::i32); - SDValue False = DAG.getConstant(0, MVT::i32); - - SDValue LSEL = DAG.getNode(MipsISD::FPSelectCC, dl, True.getValueType(), - LHS, True, False, LHS.getOperand(2)); - SDValue RSEL = DAG.getNode(MipsISD::FPSelectCC, dl, True.getValueType(), - RHS, True, False, RHS.getOperand(2)); - - return DAG.getNode(Op.getOpcode(), dl, MVT::i32, LSEL, RSEL); -} - -SDValue MipsTargetLowering:: LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // The first operand is the chain, the second is the condition, the third is @@ -619,58 +723,32 @@ LowerBRCOND(SDValue Op, SelectionDAG &DAG) const SDValue Dest = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); - if (Op.getOperand(1).getOpcode() != MipsISD::FPCmp) + SDValue CondRes = CreateFPCmp(DAG, Op.getOperand(1)); + + // Return if flag is not set by a floating point comparison. 
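For targets without conditional moves, the custom inserter above expands the pseudo into the branch-over diamond its comments describe. A minimal C++ model of the resulting control flow for a MOVN-style select (illustrative only; the function and value names are mine, not the patch's):

#include <cstdio>

// dst = (cond != 0) ? T : F, built as thisMBB -> {copy0MBB} -> sinkMBB.
static int movnDiamond(int T, int cond, int F) {
  int Result = F;   // value reaching the PHI from thisMBB
  if (cond != 0)    // "beq $cond, $zero, sinkMBB" not taken: fall through
    Result = T;     // copy0MBB supplies the other PHI input
  return Result;    // sinkMBB: PHI merges the two predecessors
}

int main() {
  printf("%d %d\n", movnDiamond(10, 1, 20), movnDiamond(10, 0, 20)); // 10 20
  return 0;
}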
+ if (CondRes.getOpcode() != MipsISD::FPCmp) return Op; - SDValue CondRes = Op.getOperand(1); SDValue CCNode = CondRes.getOperand(2); Mips::CondCode CC = (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue(); SDValue BrCode = DAG.getConstant(GetFPBranchCodeFromCond(CC), MVT::i32); return DAG.getNode(MipsISD::FPBrcond, dl, Op.getValueType(), Chain, BrCode, - Dest, CondRes); -} - -SDValue MipsTargetLowering:: -LowerSETCC(SDValue Op, SelectionDAG &DAG) const -{ - // The operands to this are the left and right operands to compare (ops #0, - // and #1) and the condition code to compare them with (op #2) as a - // CondCodeSDNode. - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); - - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); - - return DAG.getNode(MipsISD::FPCmp, dl, Op.getValueType(), LHS, RHS, - DAG.getConstant(FPCondCCodeToFCC(CC), MVT::i32)); + Dest, CondRes); } SDValue MipsTargetLowering:: LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue Cond = Op.getOperand(0); - SDValue True = Op.getOperand(1); - SDValue False = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); + SDValue Cond = CreateFPCmp(DAG, Op.getOperand(0)); - // if the incomming condition comes from a integer compare, the select - // operation must be SelectCC or a conditional move if the subtarget - // supports it. - if (Cond.getOpcode() != MipsISD::FPCmp) { - if (Subtarget->hasCondMov() && !True.getValueType().isFloatingPoint()) - return Op; - return DAG.getNode(MipsISD::SelectCC, dl, True.getValueType(), - Cond, True, False); - } + // Return if flag is not set by a floating point comparison. + if (Cond.getOpcode() != MipsISD::FPCmp) + return Op; - // if the incomming condition comes from fpcmp, the select - // operation must use FPSelectCC. - SDValue CCNode = Cond.getOperand(2); - return DAG.getNode(MipsISD::FPSelectCC, dl, True.getValueType(), - Cond, True, False, CCNode); + return CreateCMovFP(DAG, Cond, Op.getOperand(1), Op.getOperand(2), + Op.getDebugLoc()); } SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, @@ -693,12 +771,13 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRelNode); } // %hi/%lo relocation - SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, - MipsII::MO_ABS_HILO); - SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &GA, 1); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GA); + SDValue GAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_ABS_HI); + SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_ABS_LO); + SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &GAHi, 1); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo); return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); - } else { SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, MipsII::MO_GOT); @@ -707,9 +786,12 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, false, false, 0); // On functions and global targets not internal linked only // a load from got/GP is necessary for PIC to work. 
- if (!GV->hasLocalLinkage() || isa<Function>(GV)) + if (!GV->hasInternalLinkage() && + (!GV->hasLocalLinkage() || isa<Function>(GV))) return ResNode; - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GA); + SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_ABS_LO); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo); return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo); } @@ -717,6 +799,34 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, return SDValue(0,0); } +SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + // FIXME there isn't actually debug info here + DebugLoc dl = Op.getDebugLoc(); + + if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + // %hi/%lo relocation + SDValue BAHi = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_ABS_HI); + SDValue BALo = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_ABS_LO); + SDValue Hi = DAG.getNode(MipsISD::Hi, dl, MVT::i32, BAHi); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALo); + return DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo); + } + + SDValue BAGOTOffset = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_GOT); + SDValue BALOOffset = DAG.getBlockAddress(BA, MVT::i32, true, + MipsII::MO_ABS_LO); + SDValue Load = DAG.getLoad(MVT::i32, dl, + DAG.getEntryNode(), BAGOTOffset, + MachinePointerInfo(), false, false, 0); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALOOffset); + return DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo); +} + SDValue MipsTargetLowering:: LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { @@ -732,7 +842,7 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const // FIXME there isn't actually debug info here DebugLoc dl = Op.getDebugLoc(); bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_; - unsigned char OpFlag = IsPIC ? MipsII::MO_GOT : MipsII::MO_ABS_HILO; + unsigned char OpFlag = IsPIC ? MipsII::MO_GOT : MipsII::MO_ABS_HI; EVT PtrVT = Op.getValueType(); JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); @@ -747,7 +857,9 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const MachinePointerInfo(), false, false, 0); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, JTI); + SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + MipsII::MO_ABS_LO); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, JTILo); ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); return ResNode; @@ -764,7 +876,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const // gp_rel relocation // FIXME: we should reference the constant pool using small data sections, - // but the asm printer currently doens't support this feature without + // but the asm printer currently doesn't support this feature without // hacking it. This feature should come soon so we can uncomment the // stuff below. 
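The %hi/%lo pairs used throughout these lowerings (LowerGlobalAddress, LowerBlockAddress, LowerJumpTable, LowerConstantPool) follow the standard MIPS convention: lui materializes the high half and addiu adds a sign-extended low half, so %hi must round up when bit 15 of the address is set. A small self-contained check, illustrative rather than part of the patch:

#include <cstdint>
#include <cstdio>

static uint32_t hi16(uint32_t a) { return (a + 0x8000) >> 16; }    // %hi
static int16_t  lo16(uint32_t a) { return (int16_t)(a & 0xFFFF); } // %lo

int main() {
  uint32_t addr = 0x1000FFFC; // bit 15 set, forces the %hi round-up
  // lui $t0, %hi(addr) ; addiu $t0, $t0, %lo(addr)
  uint32_t got = (hi16(addr) << 16) + (int32_t)lo16(addr);
  printf("%08x %08x\n", addr, got); // both print 1000fffc
  return 0;
}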
//if (IsInSmallSection(C->getType())) { @@ -773,18 +885,22 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const // ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode); if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { - SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), - N->getOffset(), MipsII::MO_ABS_HILO); - SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, MVT::i32, CP); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CP); + SDValue CPHi = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), + N->getOffset(), MipsII::MO_ABS_HI); + SDValue CPLo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), + N->getOffset(), MipsII::MO_ABS_LO); + SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, MVT::i32, CPHi); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CPLo); ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); } else { SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), - N->getOffset(), MipsII::MO_GOT); + N->getOffset(), MipsII::MO_GOT); SDValue Load = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, 0); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CP); + SDValue CPLo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), + N->getOffset(), MipsII::MO_ABS_LO); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CPLo); ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo); } @@ -937,46 +1053,28 @@ static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT, LocInfo = CCValAssign::AExt; } - if (ValVT == MVT::i32 || ValVT == MVT::f32) { - if (unsigned Reg = State.AllocateReg(IntRegs, IntRegsSize)) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, MVT::i32, LocInfo)); - return false; - } - unsigned Off = State.AllocateStack(4, 4); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Off, LocVT, LocInfo)); - return false; - } - - unsigned UnallocIntReg = State.getFirstUnallocated(IntRegs, IntRegsSize); - if (ValVT == MVT::f64) { - if (IntRegs[UnallocIntReg] == (unsigned (Mips::A1))) { - // A1 can't be used anymore, because 64 bit arguments - // must be aligned when copied back to the caller stack - State.AllocateReg(IntRegs, IntRegsSize); - UnallocIntReg++; - } - - if (IntRegs[UnallocIntReg] == (unsigned (Mips::A0)) || - IntRegs[UnallocIntReg] == (unsigned (Mips::A2))) { - unsigned Reg = State.AllocateReg(IntRegs, IntRegsSize); - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, MVT::i32, LocInfo)); - // Shadow the next register so it can be used - // later to get the other 32bit part. - State.AllocateReg(IntRegs, IntRegsSize); - return false; - } + unsigned Reg; - // Register is shadowed to preserve alignment, and the - // argument goes to a stack location. 
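A compact model of the O32 rule that the rewritten CC_MipsO32_VarArgs below encodes: i32/f32 take the next register of A0-A3, while f64 needs an even/odd pair, so a leading A1 or A3 is discarded and the pair's second register is shadowed. Illustrative only; the enum and helpers are mine, not the patch's:

#include <cstdio>

enum { A0, A1, A2, A3, NumArgRegs };
static int Next = A0;

static int allocReg() { return Next < NumArgRegs ? Next++ : -1; }

static void passF64(const char *Name) {
  int Reg = allocReg();
  if (Reg == A1 || Reg == A3) // a 64-bit pair can't start on an odd reg
    Reg = allocReg();
  allocReg();                 // shadow the second half of the pair
  if (Reg == A0 || Reg == A2)
    printf("%s -> A%d/A%d\n", Name, Reg, Reg + 1);
  else
    printf("%s -> stack (8-byte aligned slot)\n", Name);
}

int main() {
  allocReg();         // an i32 takes A0
  passF64("double1"); // skips A1, lands in A2/A3
  passF64("double2"); // registers exhausted -> stack
  return 0;
}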
-    if (UnallocIntReg != IntRegsSize)
-      State.AllocateReg(IntRegs, IntRegsSize);
+  if (ValVT == MVT::i32 || ValVT == MVT::f32) {
+    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    LocVT = MVT::i32;
+  } else if (ValVT == MVT::f64) {
+    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    if (Reg == Mips::A1 || Reg == Mips::A3)
+      Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    State.AllocateReg(IntRegs, IntRegsSize);
+    LocVT = MVT::i32;
+  } else
+    llvm_unreachable("Cannot handle this ValVT.");
-    unsigned Off = State.AllocateStack(8, 8);
-    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Off, LocVT, LocInfo));
-    return false;
-  }
+  if (!Reg) {
+    unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
+    unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  } else
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-  return true; // CC didn't match
+  return false; // CC must always match
 }
 //===----------------------------------------------------------------------===//
@@ -1043,11 +1141,12 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i32)
       Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
     if (VA.getValVT() == MVT::f64 && VA.getLocVT() == MVT::i32) {
-      Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
-      SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
-                               DAG.getConstant(0, getPointerTy()));
-      SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
-                               DAG.getConstant(1, getPointerTy()));
+      SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                               Arg, DAG.getConstant(0, MVT::i32));
+      SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                               Arg, DAG.getConstant(1, MVT::i32));
+      if (!Subtarget->isLittle())
+        std::swap(Lo, Hi);
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
       RegsToPass.push_back(std::make_pair(VA.getLocReg()+1, Hi));
       continue;
@@ -1100,7 +1199,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag is necessary since all emitted instructions must be
   // stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
@@ -1113,12 +1212,52 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
   unsigned char OpFlag = IsPIC ? MipsII::MO_GOT_CALL : MipsII::MO_NO_FLAG;
-  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
-    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
-                                        getPointerTy(), 0, OpFlag);
-  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+  bool LoadSymAddr = false;
+  SDValue CalleeLo;
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    if (IsPIC && G->getGlobal()->hasInternalLinkage()) {
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                          getPointerTy(), 0, MipsII::MO_GOT);
+      CalleeLo = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy(),
+                                            0, MipsII::MO_ABS_LO);
+    } else {
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                          getPointerTy(), 0, OpFlag);
+    }
+
+    LoadSymAddr = true;
+  }
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), OpFlag);
+    LoadSymAddr = true;
+  }
+
+  // Create nodes that load address of callee and copy it to T9
+  if (IsPIC) {
+    if (LoadSymAddr) {
+      // Load callee address
+      SDValue LoadValue = DAG.getLoad(MVT::i32, dl, Chain, Callee,
+                                      MachinePointerInfo::getGOT(),
+                                      false, false, 0);
+
+      // Use GOT+LO if callee has internal linkage.
+      if (CalleeLo.getNode()) {
+        SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CalleeLo);
+        Callee = DAG.getNode(ISD::ADD, dl, MVT::i32, LoadValue, Lo);
+      } else
+        Callee = LoadValue;
+
+      // Use chain output from LoadValue
+      Chain = LoadValue.getValue(1);
+    }
+
+    // copy to T9
+    Chain = DAG.getCopyToReg(Chain, dl, Mips::T9, Callee, SDValue(0, 0));
+    InFlag = Chain.getValue(1);
+    Callee = DAG.getRegister(Mips::T9, MVT::i32);
+  }
   // MipsJmpLink = #chain, #target_address, #opt_in_flags...
   //             = Chain, Callee, Reg#1, Reg#2, ...
@@ -1143,7 +1282,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   // Create a stack location to hold GP when PIC is used. This stack
   // location is used on function prologue to save GP and also after all
-  // emited CALL's to restore GP.
+  // emitted CALLs to restore GP.
   if (IsPIC) {
     // Function can have an arbitrary number of calls, so
     // hold the LastArgStackLoc with the biggest offset.
@@ -1218,18 +1357,18 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 /// and generate load operations for arguments places on the stack.
 SDValue
 MipsTargetLowering::LowerFormalArguments(SDValue Chain,
-                                         CallingConv::ID CallConv, bool isVarArg,
-                                         const SmallVectorImpl<ISD::InputArg>
-                                         &Ins,
-                                         DebugLoc dl, SelectionDAG &DAG,
-                                         SmallVectorImpl<SDValue> &InVals)
+                                         CallingConv::ID CallConv,
+                                         bool isVarArg,
+                                         const SmallVectorImpl<ISD::InputArg>
+                                         &Ins,
+                                         DebugLoc dl, SelectionDAG &DAG,
+                                         SmallVectorImpl<SDValue> &InVals)
                                           const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-  unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF);
   MipsFI->setVarArgsFrameIndex(0);
   // Used with vargs to acumulate store chains.
@@ -1249,9 +1388,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   else
     CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
-  SDValue StackPtr;
-
   unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ?
0 : 16); + unsigned LastStackArgEndOffset = 0; + EVT LastRegArgValVT; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -1260,6 +1399,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); ArgRegEnd = VA.getLocReg(); + LastRegArgValVT = VA.getValVT(); TargetRegisterClass *RC = 0; if (RegVT == MVT::i32) @@ -1300,8 +1440,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg()+1, RC); SDValue ArgValue2 = DAG.getCopyFromReg(Chain, dl, Reg2, RegVT); - SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, ArgValue2, ArgValue); - ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Pair); + if (!Subtarget->isLittle()) + std::swap(ArgValue, ArgValue2); + ArgValue = DAG.getNode(MipsISD::BuildPairF64, dl, MVT::f64, + ArgValue, ArgValue2); } } @@ -1321,10 +1463,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // used instead of a direct negative address (which is recorded to // be used on emitPrologue) to avoid mis-calc of the first stack // offset on PEI::calculateFrameObjectOffsets. - // Arguments are always 32-bit. - unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; + unsigned ArgSize = VA.getValVT().getSizeInBits()/8; + LastStackArgEndOffset = FirstStackArgLoc + VA.getLocMemOffset() + ArgSize; int FI = MFI->CreateFixedObject(ArgSize, 0, true); - MipsFI->recordLoadArgsFI(FI, -(ArgSize+ + MipsFI->recordLoadArgsFI(FI, -(4 + (FirstStackArgLoc + VA.getLocMemOffset()))); // Create load nodes to retrieve arguments from the stack @@ -1351,29 +1493,52 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // To meet ABI, when VARARGS are passed on registers, the registers // must have their values written to the caller stack frame. If the last // argument was placed in the stack, there's no need to save any register. - if ((isVarArg) && (Subtarget->isABI_O32() && ArgRegEnd)) { - if (StackPtr.getNode() == 0) - StackPtr = DAG.getRegister(StackReg, getPointerTy()); - - // The last register argument that must be saved is Mips::A3 - TargetRegisterClass *RC = Mips::CPURegsRegisterClass; - unsigned StackLoc = ArgLocs.size()-1; - - for (++ArgRegEnd; ArgRegEnd <= Mips::A3; ++ArgRegEnd, ++StackLoc) { - unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC); - SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32); - - int FI = MFI->CreateFixedObject(4, 0, true); - MipsFI->recordStoreVarArgsFI(FI, -(4+(StackLoc*4))); - SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); - OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, - MachinePointerInfo(), - false, false, 0)); - - // Record the frame index of the first variable argument - // which is a value necessary to VASTART. - if (!MipsFI->getVarArgsFrameIndex()) + if (isVarArg && Subtarget->isABI_O32()) { + if (ArgRegEnd) { + // Last named formal argument is passed in register. + + // The last register argument that must be saved is Mips::A3 + TargetRegisterClass *RC = Mips::CPURegsRegisterClass; + if (LastRegArgValVT == MVT::f64) + ArgRegEnd++; + + if (ArgRegEnd < Mips::A3) { + // Both the last named formal argument and the first variable + // argument are passed in registers. 
+ for (++ArgRegEnd; ArgRegEnd <= Mips::A3; ++ArgRegEnd) { + unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC); + SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32); + + int FI = MFI->CreateFixedObject(4, 0, true); + MipsFI->recordStoreVarArgsFI(FI, -(4+(ArgRegEnd-Mips::A0)*4)); + SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); + OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, + MachinePointerInfo(), + false, false, 0)); + + // Record the frame index of the first variable argument + // which is a value necessary to VASTART. + if (!MipsFI->getVarArgsFrameIndex()) { + MFI->setObjectAlignment(FI, 4); + MipsFI->setVarArgsFrameIndex(FI); + } + } + } else { + // Last named formal argument is in register Mips::A3, and the first + // variable argument is on stack. Record the frame index of the first + // variable argument. + int FI = MFI->CreateFixedObject(4, 0, true); + MFI->setObjectAlignment(FI, 4); + MipsFI->recordStoreVarArgsFI(FI, -20); MipsFI->setVarArgsFrameIndex(FI); + } + } else { + // Last named formal argument and all the variable arguments are passed + // on stack. Record the frame index of the first variable argument. + int FI = MFI->CreateFixedObject(4, 0, true); + MFI->setObjectAlignment(FI, 4); + MipsFI->recordStoreVarArgsFI(FI, -(4+LastStackArgEndOffset)); + MipsFI->setVarArgsFrameIndex(FI); } } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 9d6b9f3..e4d0c3d 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -31,45 +31,50 @@ namespace llvm { // Get the Higher 16 bits from a 32-bit immediate // No relation with Mips Hi register - Hi, + Hi, // Get the Lower 16 bits from a 32-bit immediate // No relation with Mips Lo register - Lo, + Lo, // Handle gp_rel (small data/bss sections) relocation. GPRel, - // Select CC Pseudo Instruction - SelectCC, - - // Floating Point Select CC Pseudo Instruction - FPSelectCC, - // Floating Point Branch Conditional FPBrcond, // Floating Point Compare FPCmp, + // Floating Point Conditional Moves + CMovFP_T, + CMovFP_F, + // Floating Point Rounding FPRound, - // Return + // Return Ret, // MAdd/Sub nodes MAdd, MAddu, MSub, - MSubu + MSubu, + + // DivRem(u) + DivRem, + DivRemU, + + BuildPairF64, + ExtractElementF64 }; } //===--------------------------------------------------------------------===// // TargetLowering Implementation //===--------------------------------------------------------------------===// - + class MipsTargetLowering : public TargetLowering { public: explicit MipsTargetLowering(MipsTargetMachine &TM); @@ -77,7 +82,7 @@ namespace llvm { /// LowerOperation - Provide custom lowering hooks for some operations. virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - /// getTargetNodeName - This method returns the name of a target specific + /// getTargetNodeName - This method returns the name of a target specific // DAG node. virtual const char *getTargetNodeName(unsigned Opcode) const; @@ -87,7 +92,7 @@ namespace llvm { /// getFunctionAlignment - Return the Log2 alignment of this function. 
virtual unsigned getFunctionAlignment(const Function *F) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; private: // Subtarget Info const MipsSubtarget *Subtarget; @@ -101,16 +106,15 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals) const; // Lower Operand specifics - SDValue LowerANDOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; virtual SDValue @@ -149,7 +153,7 @@ namespace llvm { ConstraintWeight getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const; - std::pair<unsigned, const TargetRegisterClass*> + std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 977e0df..a86c5c7 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -24,19 +24,28 @@ //===----------------------------------------------------------------------===// // Floating Point Compare and Branch -def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisSameAs<0, 2>, SDTCisInt<0>, - SDTCisVT<1, OtherVT>]>; -def SDT_MipsFPCmp : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, - SDTCisSameAs<1, 2>, SDTCisFP<1>, - SDTCisInt<3>]>; -def SDT_MipsFPSelectCC : SDTypeProfile<1, 4, [SDTCisInt<1>, SDTCisInt<4>, - SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>; - +def SDT_MipsFPBrcond : SDTypeProfile<0, 2, [SDTCisInt<0>, + SDTCisVT<1, OtherVT>]>; +def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>, + SDTCisInt<2>]>; +def SDT_MipsCMovFP : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, + SDTCisSameAs<1, 2>]>; +def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, + SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; +def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, f64>, + SDTCisVT<0, i32>]>; + +def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>; +def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>; +def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>; def MipsFPRound : SDNode<"MipsISD::FPRound", SDTFPRoundOp, [SDNPOptInGlue]>; -def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond, - [SDNPHasChain]>; -def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp>; -def MipsFPSelectCC : SDNode<"MipsISD::FPSelectCC", SDT_MipsFPSelectCC>; +def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond, + [SDNPHasChain, SDNPOptInGlue]>; +def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>; +def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64", + SDT_MipsExtractElementF64>; // Operand for printing out a condition code. 
let PrintMethod = "printFCCOperand" in @@ -54,7 +63,7 @@ def IsNotMipsI : Predicate<"!Subtarget.isMips1()">; //===----------------------------------------------------------------------===// // Instruction Class Templates // -// A set of multiclasses is used to address the register usage. +// A set of multiclasses is used to address the register usage. // // S32 - single precision in 16 32bit even fp registers // single precision in 32 32bit fp registers in SingleOnly mode @@ -65,7 +74,7 @@ def IsNotMipsI : Predicate<"!Subtarget.isMips1()">; // Only S32 and D32 are supported right now. //===----------------------------------------------------------------------===// -multiclass FFR1_1<bits<6> funct, string asmstr> +multiclass FFR1_1<bits<6> funct, string asmstr> { def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs), !strconcat(asmstr, ".s $fd, $fs"), []>; @@ -74,31 +83,31 @@ multiclass FFR1_1<bits<6> funct, string asmstr> !strconcat(asmstr, ".d $fd, $fs"), []>, Requires<[In32BitMode]>; } -multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp> +multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp> { def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs), - !strconcat(asmstr, ".s $fd, $fs"), + !strconcat(asmstr, ".s $fd, $fs"), [(set FGR32:$fd, (FOp FGR32:$fs))]>; def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs), - !strconcat(asmstr, ".d $fd, $fs"), + !strconcat(asmstr, ".d $fd, $fs"), [(set AFGR64:$fd, (FOp AFGR64:$fs))]>, Requires<[In32BitMode]>; } -class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc, - RegisterClass RcDst, string asmstr>: - FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs), - !strconcat(asmstr, " $fd, $fs"), []>; +class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc, + RegisterClass RcDst, string asmstr>: + FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs), + !strconcat(asmstr, " $fd, $fs"), []>; multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp> { - def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), - (ins FGR32:$fs, FGR32:$ft), + def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), + (ins FGR32:$fs, FGR32:$ft), !strconcat(asmstr, ".s $fd, $fs, $ft"), [(set FGR32:$fd, (FOp FGR32:$fs, FGR32:$ft))]>; - def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), - (ins AFGR64:$fs, AFGR64:$ft), + def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), + (ins AFGR64:$fs, AFGR64:$ft), !strconcat(asmstr, ".d $fd, $fs, $ft"), [(set AFGR64:$fd, (FOp AFGR64:$fs, AFGR64:$ft))]>, Requires<[In32BitMode]>; @@ -115,8 +124,8 @@ let ft = 0 in { defm TRUNC_W : FFR1_1<0b001101, "trunc.w">; defm CVTW : FFR1_1<0b100100, "cvt.w">; - defm FABS : FFR1_2<0b000101, "abs", fabs>; - defm FNEG : FFR1_2<0b000111, "neg", fneg>; + defm FABS : FFR1_2<0b000101, "abs", fabs>; + defm FNEG : FFR1_2<0b000111, "neg", fneg>; defm FSQRT : FFR1_2<0b000100, "sqrt", fsqrt>; /// Convert to Single Precison @@ -140,23 +149,23 @@ let ft = 0 in { def TRUNC_LD : FFR1_3<0b001001, 0x1, AFGR64, AFGR64, "trunc.l">; /// Convert to long signed integer - def CVTL_S : FFR1_3<0b100101, 0x0, FGR32, FGR32, "cvt.l">; - def CVTL_D : FFR1_3<0b100101, 0x1, AFGR64, AFGR64, "cvt.l">; - - /// Convert to Double Precison - def CVTD_S32 : FFR1_3<0b100001, 0x0, AFGR64, FGR32, "cvt.d.s">; - def CVTD_W32 : FFR1_3<0b100001, 0x2, AFGR64, FGR32, "cvt.d.w">; - def CVTD_L32 : FFR1_3<0b100001, 0x3, AFGR64, AFGR64, "cvt.d.l">; - + def CVTL_S : FFR1_3<0b100101, 0x0, FGR32, FGR32, "cvt.l">; + def CVTL_D : FFR1_3<0b100101, 0x1, AFGR64, AFGR64, "cvt.l">; + + /// Convert to 
Double Precison + def CVTD_S32 : FFR1_3<0b100001, 0x0, AFGR64, FGR32, "cvt.d.s">; + def CVTD_W32 : FFR1_3<0b100001, 0x2, AFGR64, FGR32, "cvt.d.w">; + def CVTD_L32 : FFR1_3<0b100001, 0x3, AFGR64, AFGR64, "cvt.d.l">; + /// Convert to Single Precison def CVTS_D32 : FFR1_3<0b100000, 0x1, FGR32, AFGR64, "cvt.s.d">; - def CVTS_L32 : FFR1_3<0b100000, 0x3, FGR32, AFGR64, "cvt.s.l">; + def CVTS_L32 : FFR1_3<0b100000, 0x3, FGR32, AFGR64, "cvt.s.l">; } } // The odd-numbered registers are only referenced when doing loads, // stores, and moves between floating-point and integer registers. -// When defining instructions, we reference all 32-bit registers, +// When defining instructions, we reference all 32-bit registers, // regardless of register aliasing. let fd = 0 in { /// Move Control Registers From/To CPU Registers @@ -165,7 +174,7 @@ let fd = 0 in { def CTC1 : FFR<0x11, 0x0, 0x6, (outs CCR:$rt), (ins CPURegs:$fs), "ctc1 $fs, $rt", []>; - + def MFC1 : FFR<0x11, 0x00, 0x00, (outs CPURegs:$rt), (ins FGR32:$fs), "mfc1 $rt, $fs", []>; @@ -180,18 +189,18 @@ def FMOV_D32 : FFR<0x11, 0b000110, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs), /// Floating Point Memory Instructions let Predicates = [IsNotSingleFloat, IsNotMipsI] in { - def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr), + def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr), "ldc1 $ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>; - def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr), + def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr), "sdc1 $ft, $addr", [(store AFGR64:$ft, addr:$addr)]>; } -// LWC1 and SWC1 can always be emited with odd registers. +// LWC1 and SWC1 can always be emitted with odd registers. def LWC1 : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1 $ft, $addr", - [(set FGR32:$ft, (load addr:$addr))]>; + [(set FGR32:$ft, (load addr:$addr))]>; def SWC1 : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr), "swc1 $ft, $addr", - [(store FGR32:$ft, addr:$addr)]>; + [(store FGR32:$ft, addr:$addr)]>; /// Floating-point Aritmetic defm FADD : FFR1_4<0x10, "add", fadd>; @@ -202,7 +211,7 @@ defm FSUB : FFR1_4<0x01, "sub", fsub>; //===----------------------------------------------------------------------===// // Floating Point Branch Codes //===----------------------------------------------------------------------===// -// Mips branch codes. These correspond to condcode in MipsInstrInfo.h. +// Mips branch codes. These correspond to condcode in MipsInstrInfo.h. // They must be kept in synch. 
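Context for the FCOND_* renames in this patch (FCOND_EQ becoming FCOND_OEQ, FCOND_NEQ becoming FCOND_ONE, and so on): the codes split into ordered predicates, which are false when either operand is NaN, and unordered ones, which are true. A small illustrative check, not part of the patch:

#include <cmath>
#include <cstdio>

// Ordered predicates (c.olt.d etc.) are false if an operand is NaN;
// unordered ones (c.ult.d etc.) are true. bc1t/bc1f then branch on the
// FCR31 flag the compare sets.
static bool olt(double a, double b) {
  return !std::isnan(a) && !std::isnan(b) && a < b;
}
static bool ult(double a, double b) {
  return std::isnan(a) || std::isnan(b) || a < b;
}

int main() {
  double n = std::nan("");
  printf("olt(1,2)=%d ult(1,2)=%d\n", olt(1, 2), ult(1, 2)); // 1 1
  printf("olt(n,2)=%d ult(n,2)=%d\n", olt(n, 2), ult(n, 2)); // 0 1
  return 0;
}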
def MIPS_BRANCH_F : PatLeaf<(i32 0)>; def MIPS_BRANCH_T : PatLeaf<(i32 1)>; @@ -210,11 +219,11 @@ def MIPS_BRANCH_FL : PatLeaf<(i32 2)>; def MIPS_BRANCH_TL : PatLeaf<(i32 3)>; /// Floating Point Branch of False/True (Likely) -let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in { - class FBRANCH<PatLeaf op, string asmstr> : FFI<0x11, (outs), +let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in + class FBRANCH<PatLeaf op, string asmstr> : FFI<0x11, (outs), (ins brtarget:$dst), !strconcat(asmstr, " $dst"), - [(MipsFPBrcond op, bb:$dst, FCR31)]>; -} + [(MipsFPBrcond op, bb:$dst)]>; + def BC1F : FBRANCH<MIPS_BRANCH_F, "bc1f">; def BC1T : FBRANCH<MIPS_BRANCH_T, "bc1t">; def BC1FL : FBRANCH<MIPS_BRANCH_FL, "bc1fl">; @@ -223,11 +232,11 @@ def BC1TL : FBRANCH<MIPS_BRANCH_TL, "bc1tl">; //===----------------------------------------------------------------------===// // Floating Point Flag Conditions //===----------------------------------------------------------------------===// -// Mips condition codes. They must correspond to condcode in MipsInstrInfo.h. +// Mips condition codes. They must correspond to condcode in MipsInstrInfo.h. // They must be kept in synch. def MIPS_FCOND_F : PatLeaf<(i32 0)>; def MIPS_FCOND_UN : PatLeaf<(i32 1)>; -def MIPS_FCOND_EQ : PatLeaf<(i32 2)>; +def MIPS_FCOND_OEQ : PatLeaf<(i32 2)>; def MIPS_FCOND_UEQ : PatLeaf<(i32 3)>; def MIPS_FCOND_OLT : PatLeaf<(i32 4)>; def MIPS_FCOND_ULT : PatLeaf<(i32 5)>; @@ -245,44 +254,90 @@ def MIPS_FCOND_NGT : PatLeaf<(i32 15)>; /// Floating Point Compare let hasDelaySlot = 1, Defs=[FCR31] in { def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc), - "c.$cc.s $fs, $ft", - [(set FCR31, (MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc))]>; - + "c.$cc.s $fs, $ft", + [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc)]>; + def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc), - "c.$cc.d $fs, $ft", - [(set FCR31, (MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc))]>, - Requires<[In32BitMode]>; + "c.$cc.d $fs, $ft", + [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc)]>, + Requires<[In32BitMode]>; } -//===----------------------------------------------------------------------===// -// Floating Point Pseudo-Instructions -//===----------------------------------------------------------------------===// -// For some explanation, see Select_CC at MipsInstrInfo.td. We also embedd a -// condiciton code to enable easy handling by the Custom Inserter. -let usesCustomInserter = 1, Uses=[FCR31] in { - class PseudoFPSelCC<RegisterClass RC, string asmstr> : - MipsPseudo<(outs RC:$dst), - (ins CPURegs:$CmpRes, RC:$T, RC:$F, condcode:$cc), asmstr, - [(set RC:$dst, (MipsFPSelectCC CPURegs:$CmpRes, RC:$T, RC:$F, - imm:$cc))]>; +// Conditional moves: +// These instructions are expanded in +// MipsISelLowering::EmitInstrWithCustomInserter if target does not have +// conditional move instructions. 
+// flag:int, data:float +let usesCustomInserter = 1, Constraints = "$F = $dst" in +class CondMovIntFP<RegisterClass RC, bits<5> fmt, bits<6> func, + string instr_asm> : + FFR<0x11, func, fmt, (outs RC:$dst), (ins RC:$T, CPURegs:$cond, RC:$F), + !strconcat(instr_asm, "\t$dst, $T, $cond"), []>; + +def MOVZ_S : CondMovIntFP<FGR32, 16, 18, "movz.s">; +def MOVN_S : CondMovIntFP<FGR32, 16, 19, "movn.s">; + +let Predicates = [In32BitMode] in { + def MOVZ_D : CondMovIntFP<AFGR64, 17, 18, "movz.d">; + def MOVN_D : CondMovIntFP<AFGR64, 17, 19, "movn.d">; } -// The values to be selected are fp but the condition test is with integers. -def Select_CC_S32 : PseudoSelCC<FGR32, "# MipsSelect_CC_S32_f32">; -def Select_CC_D32 : PseudoSelCC<AFGR64, "# MipsSelect_CC_D32_f32">, - Requires<[In32BitMode]>; +defm : MovzPats<FGR32, MOVZ_S>; +defm : MovnPats<FGR32, MOVN_S>; + +let Predicates = [In32BitMode] in { + defm : MovzPats<AFGR64, MOVZ_D>; + defm : MovnPats<AFGR64, MOVN_D>; +} -// The values to be selected are int but the condition test is done with fp. -def Select_FCC : PseudoFPSelCC<CPURegs, "# MipsSelect_FCC">; +let usesCustomInserter = 1, Uses = [FCR31], Constraints = "$F = $dst" in { +// flag:float, data:int +class CondMovFPInt<SDNode cmov, bits<1> tf, string instr_asm> : + FCMOV<tf, (outs CPURegs:$dst), (ins CPURegs:$T, CPURegs:$F), + !strconcat(instr_asm, "\t$dst, $T, $$fcc0"), + [(set CPURegs:$dst, (cmov CPURegs:$T, CPURegs:$F))]>; + +// flag:float, data:float +class CondMovFPFP<RegisterClass RC, SDNode cmov, bits<5> fmt, bits<1> tf, + string instr_asm> : + FFCMOV<fmt, tf, (outs RC:$dst), (ins RC:$T, RC:$F), + !strconcat(instr_asm, "\t$dst, $T, $$fcc0"), + [(set RC:$dst, (cmov RC:$T, RC:$F))]>; +} -// The values to be selected and the condition test is done with fp. -def Select_FCC_S32 : PseudoFPSelCC<FGR32, "# MipsSelect_FCC_S32_f32">; -def Select_FCC_D32 : PseudoFPSelCC<AFGR64, "# MipsSelect_FCC_D32_f32">, - Requires<[In32BitMode]>; +def MOVT : CondMovFPInt<MipsCMovFP_T, 1, "movt">; +def MOVF : CondMovFPInt<MipsCMovFP_F, 0, "movf">; +def MOVT_S : CondMovFPFP<FGR32, MipsCMovFP_T, 16, 1, "movt.s">; +def MOVF_S : CondMovFPFP<FGR32, MipsCMovFP_F, 16, 0, "movf.s">; -def MOVCCRToCCR : MipsPseudo<(outs CCR:$dst), (ins CCR:$src), - "# MOVCCRToCCR", []>; +let Predicates = [In32BitMode] in { + def MOVT_D : CondMovFPFP<AFGR64, MipsCMovFP_T, 17, 1, "movt.d">; + def MOVF_D : CondMovFPFP<AFGR64, MipsCMovFP_F, 17, 0, "movf.d">; +} + +//===----------------------------------------------------------------------===// +// Floating Point Pseudo-Instructions +//===----------------------------------------------------------------------===// +def MOVCCRToCCR : MipsPseudo<(outs CCR:$dst), (ins CCR:$src), + "# MOVCCRToCCR", []>; + +// This pseudo instr gets expanded into 2 mtc1 instrs after register +// allocation. +def BuildPairF64 : + MipsPseudo<(outs AFGR64:$dst), + (ins CPURegs:$lo, CPURegs:$hi), "", + [(set AFGR64:$dst, (MipsBuildPairF64 CPURegs:$lo, CPURegs:$hi))]>; + +// This pseudo instr gets expanded into 2 mfc1 instrs after register +// allocation. +// if n is 0, lower part of src is extracted. +// if n is 1, higher part of src is extracted. 
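What the BuildPairF64/ExtractElementF64 pseudos in this hunk model, in plain C++: an AFGR64 double lives in an even/odd pair of 32-bit FPU registers, built with two mtc1 and taken apart with two mfc1; which word is the low half depends on endianness, hence the std::swap(Lo, Hi) on big-endian in LowerCall earlier. Illustrative sketch, not the patch's code:

#include <cstdint>
#include <cstdio>
#include <cstring>

static double buildPairF64(uint32_t lo, uint32_t hi) {
  uint64_t bits = ((uint64_t)hi << 32) | lo; // mtc1 lo, $f12 ; mtc1 hi, $f13
  double d;
  std::memcpy(&d, &bits, sizeof d);
  return d;
}

static uint32_t extractElementF64(double d, int n) { // n==0 -> lo, n==1 -> hi
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof bits);
  return (uint32_t)(n ? bits >> 32 : bits);
}

int main() {
  double d = buildPairF64(0, 0x3FF00000); // IEEE-754 words of 1.0
  printf("%g %08x\n", d, extractElementF64(d, 1)); // 1 3ff00000
  return 0;
}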
+def ExtractElementF64 : + MipsPseudo<(outs CPURegs:$dst), + (ins AFGR64:$src, i32imm:$n), "", + [(set CPURegs:$dst, + (MipsExtractElementF64 AFGR64:$src, imm:$n))]>; //===----------------------------------------------------------------------===// // Floating Point Patterns @@ -306,7 +361,7 @@ def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S32 FGR32:$src))>; def : Pat<(i32 (bitconvert FGR32:$src)), (MFC1 FGR32:$src)>; def : Pat<(f32 (bitconvert CPURegs:$src)), (MTC1 CPURegs:$src)>; -let Predicates = [In32BitMode] in { +let Predicates = [In32BitMode] in { def : Pat<(f32 (fround AFGR64:$src)), (CVTS_D32 AFGR64:$src)>; def : Pat<(f64 (fextend FGR32:$src)), (CVTD_S32 FGR32:$src)>; } diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 98ae2fa..9dfcdfb 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -22,8 +22,8 @@ //===----------------------------------------------------------------------===// // Generic Mips Format -class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin>: Instruction +class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, + InstrItinClass itin>: Instruction { field bits<32> Inst; @@ -32,8 +32,8 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, bits<6> opcode; // Top 5 bits are the 'opcode' field - let Inst{31-26} = opcode; - + let Inst{31-26} = opcode; + dag OutOperandList = outs; dag InOperandList = ins; @@ -52,7 +52,7 @@ class MipsPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>: class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst<outs, ins, asmstr, pattern, itin> + MipsInst<outs, ins, asmstr, pattern, itin> { bits<5> rd; bits<5> rs; @@ -64,7 +64,7 @@ class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr, let funct = _funct; let Inst{25-21} = rs; - let Inst{20-16} = rt; + let Inst{20-16} = rt; let Inst{15-11} = rd; let Inst{10-6} = shamt; let Inst{5-0} = funct; @@ -75,7 +75,7 @@ class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr, //===----------------------------------------------------------------------===// class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> + InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> { bits<5> rt; bits<5> rs; @@ -84,7 +84,7 @@ class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, let opcode = op; let Inst{25-21} = rs; - let Inst{20-16} = rt; + let Inst{20-16} = rt; let Inst{15-0} = imm16; } @@ -93,12 +93,12 @@ class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, //===----------------------------------------------------------------------===// class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> + InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> { bits<26> addr; let opcode = op; - + let Inst{25-0} = addr; } @@ -119,9 +119,9 @@ class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, // Format FR instruction class in Mips : <|opcode|fmt|ft|fs|fd|funct|> //===----------------------------------------------------------------------===// -class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins, - string asmstr, list<dag> pattern> : - MipsInst<outs, ins, asmstr, pattern, NoItinerary> +class FFR<bits<6> op, 
bits<6> _funct, bits<5> _fmt, dag outs, dag ins, + string asmstr, list<dag> pattern> : + MipsInst<outs, ins, asmstr, pattern, NoItinerary> { bits<5> fd; bits<5> fs; @@ -134,7 +134,7 @@ class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins, let fmt = _fmt; let Inst{25-21} = fmt; - let Inst{20-16} = ft; + let Inst{20-16} = ft; let Inst{15-11} = fs; let Inst{10-6} = fd; let Inst{5-0} = funct; @@ -144,8 +144,8 @@ class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins, // Format FI instruction class in Mips : <|opcode|base|ft|immediate|> //===----------------------------------------------------------------------===// -class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: - MipsInst<outs, ins, asmstr, pattern, NoItinerary> +class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: + MipsInst<outs, ins, asmstr, pattern, NoItinerary> { bits<5> ft; bits<5> base; @@ -154,7 +154,7 @@ class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: let opcode = op; let Inst{25-21} = base; - let Inst{20-16} = ft; + let Inst{20-16} = ft; let Inst{15-0} = imm16; } @@ -162,8 +162,8 @@ class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: // Compare instruction class in Mips : <|010001|fmt|ft|fs|0000011|condcode|> //===----------------------------------------------------------------------===// -class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> : - MipsInst<outs, ins, asmstr, pattern, NoItinerary> +class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> : + MipsInst<outs, ins, asmstr, pattern, NoItinerary> { bits<5> fs; bits<5> ft; @@ -174,9 +174,54 @@ class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> : let fmt = _fmt; let Inst{25-21} = fmt; - let Inst{20-16} = ft; + let Inst{20-16} = ft; let Inst{15-11} = fs; let Inst{10-6} = 0; let Inst{5-4} = 0b11; let Inst{3-0} = cc; } + + +class FCMOV<bits<1> _tf, dag outs, dag ins, string asmstr, + list<dag> pattern> : + MipsInst<outs, ins, asmstr, pattern, NoItinerary> +{ + bits<5> rd; + bits<5> rs; + bits<3> N; + bits<1> tf; + + let opcode = 0; + let tf = _tf; + + let Inst{25-21} = rs; + let Inst{20-18} = N; + let Inst{17} = 0; + let Inst{16} = tf; + let Inst{15-11} = rd; + let Inst{10-6} = 0; + let Inst{5-0} = 1; +} + +class FFCMOV<bits<5> _fmt, bits<1> _tf, dag outs, dag ins, string asmstr, + list<dag> pattern> : + MipsInst<outs, ins, asmstr, pattern, NoItinerary> +{ + bits<5> fd; + bits<5> fs; + bits<3> N; + bits<5> fmt; + bits<1> tf; + + let opcode = 17; + let fmt = _fmt; + let tf = _tf; + + let Inst{25-21} = fmt; + let Inst{20-18} = N; + let Inst{17} = 0; + let Inst{16} = tf; + let Inst{15-11} = fs; + let Inst{10-6} = fd; + let Inst{5-0} = 17; +}
\ No newline at end of file diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index aaf307b..be044fa 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -36,7 +36,7 @@ static bool isZeroImm(const MachineOperand &op) { /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than loading from the stack slot. unsigned MipsInstrInfo:: -isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const +isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { if ((MI->getOpcode() == Mips::LW) || (MI->getOpcode() == Mips::LWC1) || (MI->getOpcode() == Mips::LDC1)) { @@ -57,7 +57,7 @@ isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than storing to the stack slot. unsigned MipsInstrInfo:: -isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const +isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const { if ((MI->getOpcode() == Mips::SW) || (MI->getOpcode() == Mips::SWC1) || (MI->getOpcode() == Mips::SDC1)) { @@ -74,7 +74,7 @@ isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const /// insertNoop - If data hazard condition is found insert the target nop /// instruction. void MipsInstrInfo:: -insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const +insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { DebugLoc DL; BuildMI(MBB, MI, DL, get(Mips::NOP)); @@ -136,7 +136,7 @@ copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)); return; } - + if (Mips::AFGR64RegClass.contains(DestReg, SrcReg)) { BuildMI(MBB, I, DL, get(Mips::FMOV_D32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -153,13 +153,13 @@ copyPhysReg(MachineBasicBlock &MBB, void MipsInstrInfo:: storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, + unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == Mips::CPURegsRegisterClass) + if (RC == Mips::CPURegsRegisterClass) BuildMI(MBB, I, DL, get(Mips::SW)).addReg(SrcReg, getKillRegState(isKill)) .addImm(0).addFrameIndex(FI); else if (RC == Mips::FGR32RegisterClass) @@ -171,7 +171,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addReg(SrcReg, getKillRegState(isKill)) .addImm(0).addFrameIndex(FI); } else { - const TargetRegisterInfo *TRI = + const TargetRegisterInfo *TRI = MBB.getParent()->getTarget().getRegisterInfo(); const unsigned *SubSet = TRI->getSubRegisters(SrcReg); BuildMI(MBB, I, DL, get(Mips::SWC1)) @@ -189,12 +189,12 @@ void MipsInstrInfo:: loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const + const TargetRegisterInfo *TRI) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == Mips::CPURegsRegisterClass) + if (RC == Mips::CPURegsRegisterClass) BuildMI(MBB, I, DL, get(Mips::LW), DestReg).addImm(0).addFrameIndex(FI); else if (RC == Mips::FGR32RegisterClass) BuildMI(MBB, I, DL, get(Mips::LWC1), DestReg).addImm(0).addFrameIndex(FI); @@ -202,7 +202,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, if (!TM.getSubtarget<MipsSubtarget>().isMips1()) { BuildMI(MBB, I, DL, 
get(Mips::LDC1), DestReg).addImm(0).addFrameIndex(FI); } else { - const TargetRegisterInfo *TRI = + const TargetRegisterInfo *TRI = MBB.getParent()->getTarget().getRegisterInfo(); const unsigned *SubSet = TRI->getSubRegisters(DestReg); BuildMI(MBB, I, DL, get(Mips::LWC1), SubSet[0]) @@ -218,281 +218,202 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // Branch Analysis //===----------------------------------------------------------------------===// -/// GetCondFromBranchOpc - Return the Mips CC that matches -/// the correspondent Branch instruction opcode. -static Mips::CondCode GetCondFromBranchOpc(unsigned BrOpc) -{ - switch (BrOpc) { - default: return Mips::COND_INVALID; - case Mips::BEQ : return Mips::COND_E; - case Mips::BNE : return Mips::COND_NE; - case Mips::BGTZ : return Mips::COND_GZ; - case Mips::BGEZ : return Mips::COND_GEZ; - case Mips::BLTZ : return Mips::COND_LZ; - case Mips::BLEZ : return Mips::COND_LEZ; - - // We dont do fp branch analysis yet! - case Mips::BC1T : - case Mips::BC1F : return Mips::COND_INVALID; - } +static unsigned GetAnalyzableBrOpc(unsigned Opc) { + return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ || + Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ || + Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::J) ? Opc : 0; } -/// GetCondBranchFromCond - Return the Branch instruction -/// opcode that matches the cc. -unsigned Mips::GetCondBranchFromCond(Mips::CondCode CC) +/// GetOppositeBranchOpc - Return the inverse of the specified +/// opcode, e.g. turning BEQ to BNE. +unsigned Mips::GetOppositeBranchOpc(unsigned Opc) { - switch (CC) { - default: llvm_unreachable("Illegal condition code!"); - case Mips::COND_E : return Mips::BEQ; - case Mips::COND_NE : return Mips::BNE; - case Mips::COND_GZ : return Mips::BGTZ; - case Mips::COND_GEZ : return Mips::BGEZ; - case Mips::COND_LZ : return Mips::BLTZ; - case Mips::COND_LEZ : return Mips::BLEZ; - - case Mips::FCOND_F: - case Mips::FCOND_UN: - case Mips::FCOND_EQ: - case Mips::FCOND_UEQ: - case Mips::FCOND_OLT: - case Mips::FCOND_ULT: - case Mips::FCOND_OLE: - case Mips::FCOND_ULE: - case Mips::FCOND_SF: - case Mips::FCOND_NGLE: - case Mips::FCOND_SEQ: - case Mips::FCOND_NGL: - case Mips::FCOND_LT: - case Mips::FCOND_NGE: - case Mips::FCOND_LE: - case Mips::FCOND_NGT: return Mips::BC1T; - - case Mips::FCOND_T: - case Mips::FCOND_OR: - case Mips::FCOND_NEQ: - case Mips::FCOND_OGL: - case Mips::FCOND_UGE: - case Mips::FCOND_OGE: - case Mips::FCOND_UGT: - case Mips::FCOND_OGT: - case Mips::FCOND_ST: - case Mips::FCOND_GLE: - case Mips::FCOND_SNE: - case Mips::FCOND_GL: - case Mips::FCOND_NLT: - case Mips::FCOND_GE: - case Mips::FCOND_NLE: - case Mips::FCOND_GT: return Mips::BC1F; + switch (Opc) { + default: llvm_unreachable("Illegal opcode!"); + case Mips::BEQ : return Mips::BNE; + case Mips::BNE : return Mips::BEQ; + case Mips::BGTZ : return Mips::BLEZ; + case Mips::BGEZ : return Mips::BLTZ; + case Mips::BLTZ : return Mips::BGEZ; + case Mips::BLEZ : return Mips::BGTZ; + case Mips::BC1T : return Mips::BC1F; + case Mips::BC1F : return Mips::BC1T; } } -/// GetOppositeBranchCondition - Return the inverse of the specified -/// condition, e.g. turning COND_E to COND_NE. 
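The inversion table in GetOppositeBranchOpc above is easy to get wrong for the compare-with-zero branches: the opposite of BGTZ is BLEZ, not BLTZ. A standalone mirror of the table plus a round-trip check (illustrative, not part of the patch):

#include <cassert>
#include <cstdio>

enum Opc { BEQ, BNE, BGTZ, BGEZ, BLTZ, BLEZ, BC1T, BC1F };

static Opc opposite(Opc O) {
  switch (O) {
  case BEQ:  return BNE;   case BNE:  return BEQ;
  case BGTZ: return BLEZ;  case BLEZ: return BGTZ;
  case BGEZ: return BLTZ;  case BLTZ: return BGEZ;
  case BC1T: return BC1F;  case BC1F: return BC1T;
  }
  return BEQ; // unreachable
}

int main() {
  for (int O = BEQ; O <= BC1F; ++O) // inverting twice is the identity
    assert(opposite(opposite((Opc)O)) == (Opc)O);
  printf("inversion table is an involution\n");
  return 0;
}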
-Mips::CondCode Mips::GetOppositeBranchCondition(Mips::CondCode CC) -{ - switch (CC) { - default: llvm_unreachable("Illegal condition code!"); - case Mips::COND_E : return Mips::COND_NE; - case Mips::COND_NE : return Mips::COND_E; - case Mips::COND_GZ : return Mips::COND_LEZ; - case Mips::COND_GEZ : return Mips::COND_LZ; - case Mips::COND_LZ : return Mips::COND_GEZ; - case Mips::COND_LEZ : return Mips::COND_GZ; - case Mips::FCOND_F : return Mips::FCOND_T; - case Mips::FCOND_UN : return Mips::FCOND_OR; - case Mips::FCOND_EQ : return Mips::FCOND_NEQ; - case Mips::FCOND_UEQ: return Mips::FCOND_OGL; - case Mips::FCOND_OLT: return Mips::FCOND_UGE; - case Mips::FCOND_ULT: return Mips::FCOND_OGE; - case Mips::FCOND_OLE: return Mips::FCOND_UGT; - case Mips::FCOND_ULE: return Mips::FCOND_OGT; - case Mips::FCOND_SF: return Mips::FCOND_ST; - case Mips::FCOND_NGLE:return Mips::FCOND_GLE; - case Mips::FCOND_SEQ: return Mips::FCOND_SNE; - case Mips::FCOND_NGL: return Mips::FCOND_GL; - case Mips::FCOND_LT: return Mips::FCOND_NLT; - case Mips::FCOND_NGE: return Mips::FCOND_GE; - case Mips::FCOND_LE: return Mips::FCOND_NLE; - case Mips::FCOND_NGT: return Mips::FCOND_GT; - } +static void AnalyzeCondBr(const MachineInstr* Inst, unsigned Opc, + MachineBasicBlock *&BB, + SmallVectorImpl<MachineOperand>& Cond) { + assert(GetAnalyzableBrOpc(Opc) && "Not an analyzable branch"); + int NumOp = Inst->getNumExplicitOperands(); + + // for both int and fp branches, the last explicit operand is the + // MBB. + BB = Inst->getOperand(NumOp-1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(Opc)); + + for (int i=0; i<NumOp-1; i++) + Cond.push_back(Inst->getOperand(i)); } -bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, +bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const + bool AllowModify) const { - // If the block has no terminators, it just falls into the block after it. - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) + MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend(); + + // Skip all the debug instructions. + while (I != REnd && I->isDebugValue()) + ++I; + + if (I == REnd || !isUnpredicatedTerminator(&*I)) { + // If this block ends with no branches (it just falls through to its succ) + // just return false, leaving TBB/FBB null. + TBB = FBB = NULL; return false; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return false; - --I; } - if (!isUnpredicatedTerminator(I)) - return false; - - // Get the last instruction in the block. - MachineInstr *LastInst = I; - - // If there is only one terminator instruction, process it. + + MachineInstr *LastInst = &*I; unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (!LastInst->getDesc().isBranch()) + + // Not an analyzable branch (must be an indirect jump). + if (!GetAnalyzableBrOpc(LastOpc)) + return true; + + // Get the second to last instruction in the block. + unsigned SecondLastOpc = 0; + MachineInstr *SecondLastInst = NULL; + + if (++I != REnd) { + SecondLastInst = &*I; + SecondLastOpc = GetAnalyzableBrOpc(SecondLastInst->getOpcode()); + + // Not an analyzable branch (must be an indirect jump). + if (isUnpredicatedTerminator(SecondLastInst) && !SecondLastOpc) return true; + } + // If there is only one terminator instruction, process it. 
+ if (!SecondLastOpc) { // Unconditional branch if (LastOpc == Mips::J) { TBB = LastInst->getOperand(0).getMBB(); return false; } - Mips::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode()); - if (BranchCode == Mips::COND_INVALID) - return true; // Can't handle indirect branch. - // Conditional branch - // Block ends with fall-through condbranch. - if (LastOpc != Mips::COND_INVALID) { - int LastNumOp = LastInst->getNumOperands(); - - TBB = LastInst->getOperand(LastNumOp-1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(BranchCode)); - - for (int i=0; i<LastNumOp-1; i++) { - Cond.push_back(LastInst->getOperand(i)); - } - - return false; - } + AnalyzeCondBr(LastInst, LastOpc, TBB, Cond); + return false; } - - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - + + // If we reached here, there are two branches. // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + if (++I != REnd && isUnpredicatedTerminator(&*I)) return true; - // If the block ends with Mips::J and a Mips::BNE/Mips::BEQ, handle it. - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - Mips::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc); + // If second to last instruction is an unconditional branch, + // analyze it and remove the last instruction. + if (SecondLastOpc == Mips::J) { + // Return if the last instruction cannot be removed. + if (!AllowModify) + return true; - if (BranchCode != Mips::COND_INVALID && LastOpc == Mips::J) { - int SecondNumOp = SecondLastInst->getNumOperands(); + TBB = SecondLastInst->getOperand(0).getMBB(); + LastInst->eraseFromParent(); + return false; + } - TBB = SecondLastInst->getOperand(SecondNumOp-1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(BranchCode)); + // Conditional branch followed by an unconditional branch. + // The last one must be unconditional. + if (LastOpc != Mips::J) + return true; - for (int i=0; i<SecondNumOp-1; i++) { - Cond.push_back(SecondLastInst->getOperand(i)); - } + AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } + return false; +} - // If the block ends with two unconditional branches, handle it. The last - // one is not executed, so remove it. - if ((SecondLastOpc == Mips::J) && (LastOpc == Mips::J)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return false; - } +void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, DebugLoc DL, + const SmallVectorImpl<MachineOperand>& Cond) + const { + unsigned Opc = Cond[0].getImm(); + const TargetInstrDesc &TID = get(Opc); + MachineInstrBuilder MIB = BuildMI(&MBB, DL, TID); - // Otherwise, can't handle this. - return true; + for (unsigned i = 1; i < Cond.size(); ++i) + MIB.addReg(Cond[i].getReg()); + + MIB.addMBB(TBB); } unsigned MipsInstrInfo:: -InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, +InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - assert((Cond.size() == 3 || Cond.size() == 2 || Cond.size() == 0) && - "Mips branch conditions can have two|three components!"); - if (FBB == 0) { // One way branch. 
- if (Cond.empty()) { - // Unconditional branch? - BuildMI(&MBB, DL, get(Mips::J)).addMBB(TBB); - } else { - // Conditional branch. - unsigned Opc = GetCondBranchFromCond((Mips::CondCode)Cond[0].getImm()); - const TargetInstrDesc &TID = get(Opc); - - if (TID.getNumOperands() == 3) - BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()) - .addReg(Cond[2].getReg()) - .addMBB(TBB); - else - BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()) - .addMBB(TBB); - - } - return 1; - } - - // Two-way Conditional branch. - unsigned Opc = GetCondBranchFromCond((Mips::CondCode)Cond[0].getImm()); - const TargetInstrDesc &TID = get(Opc); + // # of condition operands: + // Unconditional branches: 0 + // Floating point branches: 1 (opc) + // Int BranchZero: 2 (opc, reg) + // Int Branch: 3 (opc, reg0, reg1) + assert((Cond.size() <= 3) && + "# of Mips branch conditions must be <= 3!"); - if (TID.getNumOperands() == 3) - BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()).addReg(Cond[2].getReg()) - .addMBB(TBB); - else - BuildMI(&MBB, DL, TID).addReg(Cond[1].getReg()).addMBB(TBB); + // Two-way Conditional branch. + if (FBB) { + BuildCondBr(MBB, TBB, DL, Cond); + BuildMI(&MBB, DL, get(Mips::J)).addMBB(FBB); + return 2; + } - BuildMI(&MBB, DL, get(Mips::J)).addMBB(FBB); - return 2; + // One way branch. + // Unconditional branch. + if (Cond.empty()) + BuildMI(&MBB, DL, get(Mips::J)).addMBB(TBB); + else // Conditional branch. + BuildCondBr(MBB, TBB, DL, Cond); + return 1; } unsigned MipsInstrInfo:: -RemoveBranch(MachineBasicBlock &MBB) const +RemoveBranch(MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) return 0; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return 0; - --I; - } - if (I->getOpcode() != Mips::J && - GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID) - return 0; - - // Remove the branch. - I->eraseFromParent(); - - I = MBB.end(); - - if (I == MBB.begin()) return 1; - --I; - if (GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID) - return 1; - - // Remove the branch. - I->eraseFromParent(); - return 2; + MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend(); + MachineBasicBlock::reverse_iterator FirstBr; + unsigned removed; + + // Skip all the debug instructions. + while (I != REnd && I->isDebugValue()) + ++I; + + FirstBr = I; + + // Up to 2 branches are removed. + // Note that indirect branches are not removed. + for(removed = 0; I != REnd && removed < 2; ++I, ++removed) + if (!GetAnalyzableBrOpc(I->getOpcode())) + break; + + MBB.erase(I.base(), FirstBr.base()); + + return removed; } -/// ReverseBranchCondition - Return the inverse opcode of the +/// ReverseBranchCondition - Return the inverse opcode of the /// specified Branch instruction. 
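// The rewritten RemoveBranch above scans from the end of the block with a
// reverse iterator and erases everything it recognized with one half-open
// erase. The subtle part is reverse_iterator::base(), which points one
// element *past* the one the reverse iterator refers to, exactly what a
// forward erase(first, last) expects. A toy model over std::list (debug
// values omitted; the branch test is an invented stand-in):
#include <cassert>
#include <list>

static bool isAnalyzableBranch(int op) { return op >= 100; }

static unsigned removeBranches(std::list<int> &block) {
  auto I = block.rbegin(), REnd = block.rend();
  auto FirstBr = I;
  unsigned removed = 0;
  for (; I != REnd && removed < 2; ++I, ++removed)
    if (!isAnalyzableBranch(*I))
      break;
  // I now refers to the first instruction to keep, so I.base() is the
  // first to erase; FirstBr.base() is one past the last branch.
  block.erase(I.base(), FirstBr.base());
  return removed;
}

int main() {
  std::list<int> block = {1, 2, /*cond br*/ 100, /*uncond br*/ 101};
  assert(removeBranches(block) == 2 && block.size() == 2);
  return 0;
}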
bool MipsInstrInfo:: -ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { - assert( (Cond.size() == 3 || Cond.size() == 2) && + assert( (Cond.size() && Cond.size() <= 3) && "Invalid Mips branch condition!"); - Cond[0].setImm(GetOppositeBranchCondition((Mips::CondCode)Cond[0].getImm())); + Cond[0].setImm(Mips::GetOppositeBranchOpc(Cond[0].getImm())); return false; } diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index 52a3d39..5fdbf1f 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -37,7 +37,7 @@ namespace Mips { // To be used with float branch True FCOND_F, FCOND_UN, - FCOND_EQ, + FCOND_OEQ, FCOND_UEQ, FCOND_OLT, FCOND_ULT, @@ -57,8 +57,8 @@ namespace Mips { // above ones, but are used with a branch False; FCOND_T, FCOND_OR, - FCOND_NEQ, - FCOND_OGL, + FCOND_UNE, + FCOND_ONE, FCOND_UGE, FCOND_OGE, FCOND_UGT, @@ -70,27 +70,15 @@ namespace Mips { FCOND_NLT, FCOND_GE, FCOND_NLE, - FCOND_GT, - - // Only integer conditions - COND_E, - COND_GZ, - COND_GEZ, - COND_LZ, - COND_LEZ, - COND_NE, - COND_INVALID + FCOND_GT }; - - // Turn condition code into conditional branch opcode. - unsigned GetCondBranchFromCond(CondCode CC); - /// GetOppositeBranchCondition - Return the inverse of the specified cond, - /// e.g. turning COND_E to COND_NE. - CondCode GetOppositeBranchCondition(Mips::CondCode CC); + /// GetOppositeBranchOpc - Return the inverse of the specified + /// opcode, e.g. turning BEQ to BNE. + unsigned GetOppositeBranchOpc(unsigned Opc); /// MipsCCToString - Map each FP condition code to its string - inline static const char *MipsFCCToString(Mips::CondCode CC) + inline static const char *MipsFCCToString(Mips::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code"); @@ -98,10 +86,10 @@ namespace Mips { case FCOND_T: return "f"; case FCOND_UN: case FCOND_OR: return "un"; - case FCOND_EQ: - case FCOND_NEQ: return "eq"; + case FCOND_OEQ: + case FCOND_UNE: return "eq"; case FCOND_UEQ: - case FCOND_OGL: return "ueq"; + case FCOND_ONE: return "ueq"; case FCOND_OLT: case FCOND_UGE: return "olt"; case FCOND_ULT: @@ -121,11 +109,11 @@ namespace Mips { case FCOND_LT: case FCOND_NLT: return "lt"; case FCOND_NGE: - case FCOND_GE: return "ge"; + case FCOND_GE: return "nge"; case FCOND_LE: - case FCOND_NLE: return "nle"; + case FCOND_NLE: return "le"; case FCOND_NGT: - case FCOND_GT: return "gt"; + case FCOND_GT: return "ngt"; } } } @@ -138,27 +126,27 @@ namespace MipsII { enum TOF { //===------------------------------------------------------------------===// // Mips Specific MachineOperand flags. - + MO_NO_FLAG, /// MO_GOT - Represents the offset into the global offset table at which /// the address the relocation entry symbol resides during execution. MO_GOT, - /// MO_GOT_CALL - Represents the offset into the global offset table at - /// which the address of a call site relocation entry symbol resides + /// MO_GOT_CALL - Represents the offset into the global offset table at + /// which the address of a call site relocation entry symbol resides /// during execution. This is different from the above since this flag /// can only be present in call instructions. MO_GOT_CALL, - /// MO_GPREL - Represents the offset from the current gp value to be used + /// MO_GPREL - Represents the offset from the current gp value to be used /// for the relocatable object file being produced. 
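// The condition-code renaming above (FCOND_EQ to FCOND_OEQ, FCOND_NEQ to
// FCOND_UNE, FCOND_OGL to FCOND_ONE) makes the ordered/unordered split
// explicit: an O-predicate is false whenever an operand is NaN, while its
// U-complement is then true; the second half of the enum is the complement
// of the first and is emitted with bc1f instead of bc1t. A plain C++
// illustration of the semantics (not compiler code):
#include <cassert>
#include <cmath>

static bool oeq(double a, double b) { return a == b; }      // ordered eq
static bool une(double a, double b) { return !(a == b); }   // complement
static bool ueq(double a, double b) {
  return std::isnan(a) || std::isnan(b) || a == b;          // unordered eq
}
static bool one(double a, double b) { return !ueq(a, b); }  // complement

int main() {
  double n = std::nan("");
  assert(!oeq(n, 1.0) && une(n, 1.0)); // NaN makes the ordered form false
  assert(ueq(n, 1.0) && !one(n, 1.0)); // and the unordered form true
  return 0;
}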
MO_GPREL, - /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol - /// address. - MO_ABS_HILO - + /// MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol + /// address. + MO_ABS_HI, + MO_ABS_LO }; } @@ -181,7 +169,7 @@ public: /// any side effects other than loading from the stack slot. virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; - + /// isStoreToStackSlot - If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If @@ -189,13 +177,19 @@ public: /// any side effects other than storing to the stack slot. virtual unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const; - + /// Branch Analysis virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const; virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + +private: + void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL, + const SmallVectorImpl<MachineOperand>& Cond) const; + +public: virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, @@ -220,7 +214,7 @@ public: bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; /// Insert nop instruction when hazard condition is found - virtual void insertNoop(MachineBasicBlock &MBB, + virtual void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const; /// getGlobalBaseReg - Return a virtual register initialized with the diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index b70266a..19b9c35 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -19,18 +19,19 @@ include "MipsInstrFormats.td" def SDT_MipsRet : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; -def SDT_MipsSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, - SDTCisSameAs<2, 3>, SDTCisInt<1>]>; def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, - SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, - SDTCisInt<4>]>; + SDTCisSameAs<1, 2>, + SDTCisSameAs<3, 4>, + SDTCisInt<4>]>; def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDT_MipsMAddMSub : SDTypeProfile<0, 4, +def SDT_MipsMAddMSub : SDTypeProfile<0, 4, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, - SDTCisSameAs<1, 2>, + SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>]>; - +def SDT_MipsDivRem : SDTypeProfile<0, 2, + [SDTCisVT<0, i32>, + SDTCisSameAs<0, 1>]>; // Call def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, @@ -54,9 +55,6 @@ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart, def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -// Select Condition Code -def MipsSelectCC : SDNode<"MipsISD::SelectCC", SDT_MipsSelectCC>; - // MAdd*/MSub* nodes def MipsMAdd : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub, [SDNPOptInGlue, SDNPOutGlue]>; @@ -67,6 +65,12 @@ def MipsMSub : SDNode<"MipsISD::MSub", SDT_MipsMAddMSub, def MipsMSubu : SDNode<"MipsISD::MSubu", SDT_MipsMAddMSub, [SDNPOptInGlue, SDNPOutGlue]>; +// DivRem(u) nodes +def MipsDivRem : SDNode<"MipsISD::DivRem", SDT_MipsDivRem, + [SDNPOutGlue]>; +def MipsDivRemU : 
SDNode<"MipsISD::DivRemU", SDT_MipsDivRem, + [SDNPOutGlue]>; + //===----------------------------------------------------------------------===// // Mips Instruction Predicate Definitions. //===----------------------------------------------------------------------===// @@ -165,7 +169,7 @@ class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode, let rd = 0, shamt = 0, Defs = [HI, LO], Uses = [HI, LO] in class MArithR<bits<6> func, string instr_asm, SDNode op> : FR<0x1c, func, (outs), (ins CPURegs:$rs, CPURegs:$rt), - !strconcat(instr_asm, "\t$rs, $rt"), + !strconcat(instr_asm, "\t$rs, $rt"), [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul>; // Logical @@ -185,7 +189,7 @@ class LogicNOR<bits<6> op, bits<6> func, string instr_asm>: [(set CPURegs:$dst, (not (or CPURegs:$b, CPURegs:$c)))], IIAlu>; // Shifts -class LogicR_shift_rotate_imm<bits<6> func, bits<5> _rs, string instr_asm, +class LogicR_shift_rotate_imm<bits<6> func, bits<5> _rs, string instr_asm, SDNode OpNode>: FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$b, shamt:$c), !strconcat(instr_asm, "\t$dst, $b, $c"), @@ -193,7 +197,7 @@ class LogicR_shift_rotate_imm<bits<6> func, bits<5> _rs, string instr_asm, let rs = _rs; } -class LogicR_shift_rotate_reg<bits<6> func, bits<5> _shamt, string instr_asm, +class LogicR_shift_rotate_reg<bits<6> func, bits<5> _shamt, string instr_asm, SDNode OpNode>: FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$c, CPURegs:$b), !strconcat(instr_asm, "\t$dst, $b, $c"), @@ -283,9 +287,16 @@ let isCall=1, hasDelaySlot=1, } // Mul, Div -class MulDiv<bits<6> func, string instr_asm, InstrItinClass itin>: - FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b), - !strconcat(instr_asm, "\t$a, $b"), [], itin>; +let Defs = [HI, LO] in { + class Mul<bits<6> func, string instr_asm, InstrItinClass itin>: + FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b), + !strconcat(instr_asm, "\t$a, $b"), [], itin>; + + class Div<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>: + FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b), + !strconcat(instr_asm, "\t$$zero, $a, $b"), + [(op CPURegs:$a, CPURegs:$b)], itin>; +} // Move from Hi/Lo class MoveFromLOHI<bits<6> func, string instr_asm>: @@ -348,6 +359,11 @@ def REORDER : MipsPseudo<(outs), (ins), ".set\treorder", []>; def NOMACRO : MipsPseudo<(outs), (ins), ".set\tnomacro", []>; def NOREORDER : MipsPseudo<(outs), (ins), ".set\tnoreorder", []>; +// These macros are inserted to prevent GAS from complaining +// when using the AT register. +def NOAT : MipsPseudo<(outs), (ins), ".set\tnoat", []>; +def ATMACRO : MipsPseudo<(outs), (ins), ".set\tat", []>; + // When handling PIC code the assembler needs .cpload and .cprestore // directives. If the real instructions corresponding these directives // are used, we have the same behavior, but get also a bunch of warnings @@ -355,18 +371,6 @@ def NOREORDER : MipsPseudo<(outs), (ins), ".set\tnoreorder", []>; def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>; def CPRESTORE : MipsPseudo<(outs), (ins uimm16:$loc), ".cprestore\t$loc\n", []>; -// The supported Mips ISAs dont have any instruction close to the SELECT_CC -// operation. 
The solution is to create a Mips pseudo SELECT_CC instruction -// (MipsSelectCC), use LowerSELECT_CC to generate this instruction and finally -// replace it for real supported nodes into EmitInstrWithCustomInserter -let usesCustomInserter = 1 in { - class PseudoSelCC<RegisterClass RC, string asmstr>: - MipsPseudo<(outs RC:$dst), (ins CPURegs:$CmpRes, RC:$T, RC:$F), asmstr, - [(set RC:$dst, (MipsSelectCC CPURegs:$CmpRes, RC:$T, RC:$F))]>; -} - -def Select_CC : PseudoSelCC<CPURegs, "# MipsSelect_CC_i32">; - //===----------------------------------------------------------------------===// // Instruction definition //===----------------------------------------------------------------------===// @@ -447,12 +451,10 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, "jr\t$target", [(MipsRet CPURegs:$target)], IIBranch>; /// Multiply and Divide Instructions. -let Defs = [HI, LO] in { - def MULT : MulDiv<0x18, "mult", IIImul>; - def MULTu : MulDiv<0x19, "multu", IIImul>; - def DIV : MulDiv<0x1a, "div", IIIdiv>; - def DIVu : MulDiv<0x1b, "divu", IIIdiv>; -} +def MULT : Mul<0x18, "mult", IIImul>; +def MULTu : Mul<0x19, "multu", IIImul>; +def SDIV : Div<MipsDivRem, 0x1a, "div", IIIdiv>; +def UDIV : Div<MipsDivRemU, 0x1b, "divu", IIIdiv>; let Defs = [HI] in def MTHI : MoveToLOHI<0x11, "mthi">; @@ -489,10 +491,19 @@ let Predicates = [HasSwap] in { def MIPS_CMOV_ZERO : PatLeaf<(i32 0)>; def MIPS_CMOV_NZERO : PatLeaf<(i32 1)>; -let Predicates = [HasCondMov], Constraints = "$F = $dst" in { - def MOVN : CondMov<0x0a, "movn", MIPS_CMOV_NZERO>; - def MOVZ : CondMov<0x0b, "movz", MIPS_CMOV_ZERO>; -} +// Conditional moves: +// These instructions are expanded in +// MipsISelLowering::EmitInstrWithCustomInserter if target does not have +// conditional move instructions. +// flag:int, data:int +let usesCustomInserter = 1, shamt = 0, Constraints = "$F = $dst" in + class CondMovIntInt<bits<6> funct, string instr_asm> : + FR<0, funct, (outs CPURegs:$dst), + (ins CPURegs:$T, CPURegs:$cond, CPURegs:$F), + !strconcat(instr_asm, "\t$dst, $T, $cond"), [], NoItinerary>; + +def MOVZ_I : CondMovIntInt<0x0a, "movz">; +def MOVN_I : CondMovIntInt<0x0b, "movn">; /// No operation let addr=0 in @@ -533,7 +544,7 @@ def : Pat<(subc CPURegs:$lhs, CPURegs:$rhs), (SUBu CPURegs:$lhs, CPURegs:$rhs)>; def : Pat<(addc CPURegs:$lhs, CPURegs:$rhs), (ADDu CPURegs:$lhs, CPURegs:$rhs)>; -def : Pat<(addc CPURegs:$src, imm:$imm), +def : Pat<(addc CPURegs:$src, immSExt16:$imm), (ADDiu CPURegs:$src, imm:$imm)>; // Call @@ -546,8 +557,11 @@ def : Pat<(MipsJmpLink (i32 texternalsym:$dst)), // hi/lo relocs def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; +def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)), (ADDiu CPURegs:$hi, tglobaladdr:$lo)>; +def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)), + (ADDiu CPURegs:$hi, tblockaddress:$lo)>; def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>; def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)), @@ -599,33 +613,43 @@ def : Pat<(brcond CPURegs:$cond, bb:$dst), (BNE CPURegs:$cond, ZERO, bb:$dst)>; // select patterns -def : Pat<(select (setge CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLT CPURegs:$lhs, CPURegs:$rhs))>; -def : Pat<(select (setuge CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLTu CPURegs:$lhs, CPURegs:$rhs))>; -def : Pat<(select (setge CPURegs:$lhs, immSExt16:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, 
(SLTi CPURegs:$lhs, immSExt16:$rhs))>; -def : Pat<(select (setuge CPURegs:$lh, immSExt16:$rh), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLTiu CPURegs:$lh, immSExt16:$rh))>; - -def : Pat<(select (setle CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLT CPURegs:$rhs, CPURegs:$lhs))>; -def : Pat<(select (setule CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (SLTu CPURegs:$rhs, CPURegs:$lhs))>; - -def : Pat<(select (seteq CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVZ CPURegs:$F, CPURegs:$T, (XOR CPURegs:$lhs, CPURegs:$rhs))>; -def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F), - (MOVN CPURegs:$F, CPURegs:$T, (XOR CPURegs:$lhs, CPURegs:$rhs))>; - -def : Pat<(select CPURegs:$cond, CPURegs:$T, CPURegs:$F), - (MOVN CPURegs:$F, CPURegs:$T, CPURegs:$cond)>; +multiclass MovzPats<RegisterClass RC, Instruction MOVZInst> { + def : Pat<(select (setge CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLT CPURegs:$lhs, CPURegs:$rhs), RC:$F)>; + def : Pat<(select (setuge CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLTu CPURegs:$lhs, CPURegs:$rhs), RC:$F)>; + def : Pat<(select (setge CPURegs:$lhs, immSExt16:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLTi CPURegs:$lhs, immSExt16:$rhs), RC:$F)>; + def : Pat<(select (setuge CPURegs:$lh, immSExt16:$rh), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLTiu CPURegs:$lh, immSExt16:$rh), RC:$F)>; + def : Pat<(select (setle CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLT CPURegs:$rhs, CPURegs:$lhs), RC:$F)>; + def : Pat<(select (setule CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (SLTu CPURegs:$rhs, CPURegs:$lhs), RC:$F)>; + def : Pat<(select (seteq CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVZInst RC:$T, (XOR CPURegs:$lhs, CPURegs:$rhs), RC:$F)>; + def : Pat<(select (seteq CPURegs:$lhs, 0), RC:$T, RC:$F), + (MOVZInst RC:$T, CPURegs:$lhs, RC:$F)>; +} + +multiclass MovnPats<RegisterClass RC, Instruction MOVNInst> { + def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F), + (MOVNInst RC:$T, (XOR CPURegs:$lhs, CPURegs:$rhs), RC:$F)>; + def : Pat<(select CPURegs:$cond, RC:$T, RC:$F), + (MOVNInst RC:$T, CPURegs:$cond, RC:$F)>; + def : Pat<(select (setne CPURegs:$lhs, 0), RC:$T, RC:$F), + (MOVNInst RC:$T, CPURegs:$lhs, RC:$F)>; +} + +defm : MovzPats<CPURegs, MOVZ_I>; +defm : MovnPats<CPURegs, MOVN_I>; // select patterns with got access -def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), - (i32 tglobaladdr:$T), CPURegs:$F), - (MOVN CPURegs:$F, (ADDiu GP, tglobaladdr:$T), - (XOR CPURegs:$lhs, CPURegs:$rhs))>; +let AddedComplexity = 10 in + def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), + (i32 tglobaladdr:$T), CPURegs:$F), + (MOVN_I CPURegs:$F, (ADDiu GP, tglobaladdr:$T), + (XOR CPURegs:$lhs, CPURegs:$rhs))>; // setcc patterns def : Pat<(seteq CPURegs:$lhs, CPURegs:$rhs), diff --git a/lib/Target/Mips/MipsMCAsmInfo.h b/lib/Target/Mips/MipsMCAsmInfo.h index 15a867e..41b7192 100644 --- a/lib/Target/Mips/MipsMCAsmInfo.h +++ b/lib/Target/Mips/MipsMCAsmInfo.h @@ -19,7 +19,7 @@ namespace llvm { class Target; - + class MipsMCAsmInfo : public MCAsmInfo { public: explicit MipsMCAsmInfo(const Target &T, StringRef TT); diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 3719e58..c09b129 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -38,7 +38,7 @@ using namespace llvm; 
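// The MovzPats/MovnPats multiclasses above reduce an integer select to a
// 0/1-producing compare plus a conditional move whose destination is tied
// to the false operand. A standalone restatement of the trick (slt and
// movz are C++ stand-ins for the MIPS instructions):
#include <cassert>
#include <cstdint>

static int32_t slt(int32_t a, int32_t b) { return a < b; }

// movz rd, rs, rt: rd = (rt == 0) ? rs : rd. Pre-loading rd with the
// false value F turns this into a full select.
static int32_t movz(int32_t rd, int32_t rs, int32_t rt) {
  return rt == 0 ? rs : rd;
}

// select (setge a, b), T, F  ==>  movz T, (slt a, b), F
static int32_t select_ge(int32_t a, int32_t b, int32_t T, int32_t F) {
  return movz(/*rd=*/F, /*rs=*/T, /*rt=*/slt(a, b));
}

int main() {
  assert(select_ge(5, 3, 111, 222) == 111); // 5 >= 3 picks the true value
  assert(select_ge(2, 3, 111, 222) == 222); // 2 <  3 picks the false value
  return 0;
}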
-MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST, +MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST, const TargetInstrInfo &tii) : MipsGenRegisterInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), Subtarget(ST), TII(tii) {} @@ -46,7 +46,7 @@ MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST, /// getRegisterNumbering - Given the enum value for some register, e.g. /// Mips::RA, return the number that it corresponds to (e.g. 31). unsigned MipsRegisterInfo:: -getRegisterNumbering(unsigned RegEnum) +getRegisterNumbering(unsigned RegEnum) { switch (RegEnum) { case Mips::ZERO : case Mips::F0 : case Mips::D0 : return 0; @@ -82,30 +82,30 @@ getRegisterNumbering(unsigned RegEnum) case Mips::FP : case Mips::F30: case Mips::D15: return 30; case Mips::RA : case Mips::F31: return 31; default: llvm_unreachable("Unknown register number!"); - } + } return 0; // Not reached } unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; } //===----------------------------------------------------------------------===// -// Callee Saved Registers methods +// Callee Saved Registers methods //===----------------------------------------------------------------------===// /// Mips Callee Saved Registers const unsigned* MipsRegisterInfo:: -getCalleeSavedRegs(const MachineFunction *MF) const +getCalleeSavedRegs(const MachineFunction *MF) const { // Mips callee-save register range is $16-$23, $f20-$f30 static const unsigned SingleFloatOnlyCalleeSavedRegs[] = { - Mips::S0, Mips::S1, Mips::S2, Mips::S3, + Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5, Mips::S6, Mips::S7, - Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24, Mips::F25, + Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29, Mips::F30, 0 }; static const unsigned BitMode32CalleeSavedRegs[] = { - Mips::S0, Mips::S1, Mips::S2, Mips::S3, + Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5, Mips::S6, Mips::S7, Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30, 0 }; @@ -132,11 +132,11 @@ getReservedRegs(const MachineFunction &MF) const { if (!Subtarget.isSingleFloat()) for (unsigned FReg=(Mips::F0)+1; FReg < Mips::F30; FReg+=2) Reserved.set(FReg); - + return Reserved; } -// This function eliminate ADJCALLSTACKDOWN, +// This function eliminate ADJCALLSTACKDOWN, // ADJCALLSTACKUP pseudo instructions void MipsRegisterInfo:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, @@ -157,7 +157,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned i = 0; while (!MI.getOperand(i).isFI()) { ++i; - assert(i < MI.getNumOperands() && + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); } @@ -179,8 +179,43 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n"); - MI.getOperand(i-1).ChangeToImmediate(Offset); - MI.getOperand(i).ChangeToRegister(getFrameRegister(MF), false); + unsigned NewReg = 0; + int NewImm = 0; + MachineBasicBlock &MBB = *MI.getParent(); + bool ATUsed; + unsigned OrigReg = getFrameRegister(MF); + int OrigImm = Offset; + +// OrigImm fits in the 16-bit field + if (OrigImm < 0x8000 && OrigImm >= -0x8000) { + NewReg = OrigReg; + NewImm = OrigImm; + ATUsed = false; + } + else { + const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + DebugLoc DL = II->getDebugLoc(); + int ImmLo = OrigImm & 0xffff; + int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) + + ((OrigImm & 0x8000) != 0); 
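// A self-checking restatement of the arithmetic just above (standalone
// C++, not compiler code): the low half stays in the memory instruction,
// where it is sign-extended, so the high half that goes into lui must
// absorb a carry whenever bit 15 of the offset is set.
#include <cassert>
#include <cstdint>

static void splitOffset(int32_t orig, int32_t &hi, int32_t &lo) {
  lo = orig & 0xffff;                                    // 16-bit field
  hi = ((uint32_t)orig >> 16) + ((orig & 0x8000) != 0);  // lui operand
}

int main() {
  for (int32_t orig : {0x12348000, 0x1234ffff, -0x7000123, 0x7fff7fff}) {
    int32_t hi, lo;
    splitOffset(orig, hi, lo);
    int32_t sextLo = (int16_t)lo;  // what the hardware does with the field
    assert((int32_t)((uint32_t)hi << 16) + sextLo == orig);
  }
  return 0;
}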
+ + // FIXME: change this when mips goes MC". + BuildMI(MBB, II, DL, TII->get(Mips::NOAT)); + BuildMI(MBB, II, DL, TII->get(Mips::LUi), Mips::AT).addImm(ImmHi); + BuildMI(MBB, II, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg) + .addReg(Mips::AT); + NewReg = Mips::AT; + NewImm = ImmLo; + + ATUsed = true; + } + + // FIXME: change this when mips goes MC". + if (ATUsed) + BuildMI(MBB, ++II, MI.getDebugLoc(), TII.get(Mips::ATMACRO)); + + MI.getOperand(i).ChangeToRegister(NewReg, false); + MI.getOperand(i-1).ChangeToImmediate(NewImm); } void MipsRegisterInfo:: diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index a7f4bf9..767359f 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -26,7 +26,7 @@ class Type; struct MipsRegisterInfo : public MipsGenRegisterInfo { const MipsSubtarget &Subtarget; const TargetInstrInfo &TII; - + MipsRegisterInfo(const MipsSubtarget &Subtarget, const TargetInstrInfo &tii); /// getRegisterNumbering - Given the enum value for some register, e.g. diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index 60efe31..9f9cae7 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -17,7 +17,7 @@ class MipsReg<string n> : Register<n> { let Namespace = "Mips"; } -class MipsRegWithSubRegs<string n, list<Register> subregs> +class MipsRegWithSubRegs<string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> { field bits<5> Num; let Namespace = "Mips"; @@ -83,7 +83,7 @@ let Namespace = "Mips" in { def SP : MipsGPRReg< 29, "SP">, DwarfRegNum<[29]>; def FP : MipsGPRReg< 30, "FP">, DwarfRegNum<[30]>; def RA : MipsGPRReg< 31, "RA">, DwarfRegNum<[31]>; - + /// Mips Single point precision FPU Registers def F0 : FPR< 0, "F0">, DwarfRegNum<[32]>; def F1 : FPR< 1, "F1">, DwarfRegNum<[33]>; @@ -117,7 +117,7 @@ let Namespace = "Mips" in { def F29 : FPR<29, "F29">, DwarfRegNum<[61]>; def F30 : FPR<30, "F30">, DwarfRegNum<[62]>; def F31 : FPR<31, "F31">, DwarfRegNum<[63]>; - + /// Mips Double point precision FPU Registers (aliased /// with the single precision to hold 64 bit values) def D0 : AFPR< 0, "F0", [F0, F1]>, DwarfRegNum<[32]>; @@ -149,11 +149,11 @@ let Namespace = "Mips" in { // Register Classes //===----------------------------------------------------------------------===// -def CPURegs : RegisterClass<"Mips", [i32], 32, +def CPURegs : RegisterClass<"Mips", [i32], 32, // Return Values and Arguments [V0, V1, A0, A1, A2, A3, // Not preserved across procedure calls - T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, + T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, // Callee save S0, S1, S2, S3, S4, S5, S6, S7, // Reserved @@ -173,16 +173,16 @@ def CPURegs : RegisterClass<"Mips", [i32], 32, // 64bit fp: // * FGR64 - 32 64-bit registers -// * AFGR64 - 16 32-bit even registers (32-bit FP Mode) +// * AFGR64 - 16 32-bit even registers (32-bit FP Mode) // // 32bit fp: // * FGR32 - 16 32-bit even registers // * FGR32 - 32 32-bit registers (single float only mode) -def FGR32 : RegisterClass<"Mips", [f32], 32, +def FGR32 : RegisterClass<"Mips", [f32], 32, // Return Values and Arguments [F0, F1, F2, F3, F12, F13, F14, F15, // Not preserved across procedure calls - F4, F5, F6, F7, F8, F9, F10, F11, F16, F17, F18, F19, + F4, F5, F6, F7, F8, F9, F10, F11, F16, F17, F18, F19, // Callee save F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, // Reserved @@ -195,17 +195,17 @@ def FGR32 : RegisterClass<"Mips", [f32], 32, let MethodBodies = [{ static const 
unsigned MIPS_FGR32[] = { - Mips::F0, Mips::F1, Mips::F2, Mips::F3, Mips::F12, Mips::F13, - Mips::F14, Mips::F15, Mips::F4, Mips::F5, Mips::F6, Mips::F7, - Mips::F8, Mips::F9, Mips::F10, Mips::F11, Mips::F16, Mips::F17, - Mips::F18, Mips::F19, Mips::F20, Mips::F21, Mips::F22, Mips::F23, - Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29, + Mips::F0, Mips::F1, Mips::F2, Mips::F3, Mips::F12, Mips::F13, + Mips::F14, Mips::F15, Mips::F4, Mips::F5, Mips::F6, Mips::F7, + Mips::F8, Mips::F9, Mips::F10, Mips::F11, Mips::F16, Mips::F17, + Mips::F18, Mips::F19, Mips::F20, Mips::F21, Mips::F22, Mips::F23, + Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29, Mips::F30 }; static const unsigned MIPS_SVR4_FGR32[] = { - Mips::F0, Mips::F2, Mips::F12, Mips::F14, Mips::F4, - Mips::F6, Mips::F8, Mips::F10, Mips::F16, Mips::F18, + Mips::F0, Mips::F2, Mips::F12, Mips::F14, Mips::F4, + Mips::F6, Mips::F8, Mips::F10, Mips::F16, Mips::F18, Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30, }; @@ -217,7 +217,7 @@ def FGR32 : RegisterClass<"Mips", [f32], 32, if (Subtarget.isSingleFloat()) return MIPS_FGR32; else - return MIPS_SVR4_FGR32; + return MIPS_SVR4_FGR32; } FGR32Class::iterator @@ -233,11 +233,11 @@ def FGR32 : RegisterClass<"Mips", [f32], 32, }]; } -def AFGR64 : RegisterClass<"Mips", [f64], 64, +def AFGR64 : RegisterClass<"Mips", [f64], 64, // Return Values and Arguments [D0, D1, D6, D7, // Not preserved across procedure calls - D2, D3, D4, D5, D8, D9, + D2, D3, D4, D5, D8, D9, // Callee save D10, D11, D12, D13, D14, // Reserved diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index 49ca5d1..00be8ee 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -14,7 +14,7 @@ def ALU : FuncUnit; def IMULDIV : FuncUnit; //===----------------------------------------------------------------------===// -// Instruction Itinerary classes used for Mips +// Instruction Itinerary classes used for Mips //===----------------------------------------------------------------------===// def IIAlu : InstrItinClass; def IILoad : InstrItinClass; diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index db114da..70747f5d 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -17,7 +17,7 @@ using namespace llvm; MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &FS, - bool little) : + bool little) : MipsArchVersion(Mips1), MipsABI(O32), IsLittle(little), IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false), IsLinux(true), HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false), HasMinMax(false), @@ -33,7 +33,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &FS, if (TT.find("linux") == std::string::npos) IsLinux = false; - // When only the target triple is specified and is + // When only the target triple is specified and is // a allegrex target, set the features. 
We also match // big and little endian allegrex cores (dont really // know if a big one exists) diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index e4f4b33..096bbed 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -26,7 +26,7 @@ class MipsSubtarget : public TargetSubtarget { public: enum MipsABIEnum { O32, O64, N32, N64, EABI - }; + }; protected: @@ -34,10 +34,10 @@ protected: Mips1, Mips2, Mips3, Mips4, Mips32, Mips32r2 }; - // Mips architecture version + // Mips architecture version MipsArchEnum MipsArchVersion; - // Mips supported ABIs + // Mips supported ABIs MipsABIEnum MipsABI; // IsLittle - The target is Little Endian @@ -61,14 +61,14 @@ protected: bool IsLinux; /// Features related to the presence of specific instructions. - + // HasSEInReg - SEB and SEH (signext in register) instructions. bool HasSEInReg; // HasCondMov - Conditional mov (MOVZ, MOVN) instructions. bool HasCondMov; - // HasMulDivAdd - Multiply add and sub (MADD, MADDu, MSUB, MSUBu) + // HasMulDivAdd - Multiply add and sub (MADD, MADDu, MSUB, MSUBu) // instructions. bool HasMulDivAdd; @@ -93,14 +93,14 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. MipsSubtarget(const std::string &TT, const std::string &FS, bool little); - - /// ParseSubtargetFeatures - Parses features string setting specified + + /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. std::string ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); bool isMips1() const { return MipsArchVersion == Mips1; } - bool isMips32() const { return MipsArchVersion >= Mips32; } + bool isMips32() const { return MipsArchVersion >= Mips32; } bool isMips32r2() const { return MipsArchVersion == Mips32r2; } bool isLittle() const { return IsLittle; } diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 7a2dd1f..53190b4 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -75,3 +75,9 @@ addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) PM.add(createMipsDelaySlotFillerPass(*this)); return true; } + +bool MipsTargetMachine:: +addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { + PM.add(createMipsExpandPseudoPass(*this)); + return true; +} diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 43ab798..badb652 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -63,6 +63,7 @@ namespace llvm { CodeGenOpt::Level OptLevel); virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPostRegAlloc(PassManagerBase &, CodeGenOpt::Level); }; /// MipselTargetMachine - Mipsel target machine. diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h index 237b160..c394a9d 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.h +++ b/lib/Target/Mips/MipsTargetObjectFile.h @@ -18,22 +18,22 @@ namespace llvm { const MCSection *SmallDataSection; const MCSection *SmallBSSSection; public: - + void Initialize(MCContext &Ctx, const TargetMachine &TM); - + /// IsGlobalInSmallSection - Return true if this global address should be /// placed into small data/bss section. 
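// A sketch of the usual policy behind this predicate: placement is purely
// size based, so that qualifying objects can be reached with a 16-bit
// offset from $gp. The 8-byte threshold here is an assumption mirroring
// the conventional -G default, not a value taken from this file.
#include <cstddef>
#include <cstdio>

static const std::size_t SmallDataThreshold = 8; // assumed -G default

static const char *selectSection(std::size_t size, bool isZeroInit) {
  if (size > 0 && size <= SmallDataThreshold)
    return isZeroInit ? ".sbss" : ".sdata"; // gp-relative small sections
  return isZeroInit ? ".bss" : ".data";
}

int main() {
  std::printf("%s\n", selectSection(sizeof(int), false)); // .sdata
  std::printf("%s\n", selectSection(64, true));           // .bss
  return 0;
}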
bool IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, SectionKind Kind)const; bool IsGlobalInSmallSection(const GlobalValue *GV, - const TargetMachine &TM) const; - + const TargetMachine &TM) const; + const MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler *Mang, const TargetMachine &TM) const; - + // TODO: Classify globals as mips wishes. }; } // end namespace llvm diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp index cc3d61e..a8d6fe9 100644 --- a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp +++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp @@ -14,7 +14,7 @@ using namespace llvm; Target llvm::TheMipsTarget, llvm::TheMipselTarget; -extern "C" void LLVMInitializeMipsTargetInfo() { +extern "C" void LLVMInitializeMipsTargetInfo() { RegisterTarget<Triple::mips> X(TheMipsTarget, "mips", "Mips"); RegisterTarget<Triple::mipsel> Y(TheMipselTarget, "mipsel", "Mipsel"); diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h index 19385ba..ec2be92 100644 --- a/lib/Target/PTX/PTX.h +++ b/lib/Target/PTX/PTX.h @@ -29,6 +29,11 @@ namespace llvm { PARAMETER = 3, SHARED = 4 }; + + enum Predicate { + PRED_NORMAL = 0, + PRED_NEGATE = 1 + }; } // namespace PTX FunctionPass *createPTXISelDag(PTXTargetMachine &TM, @@ -37,7 +42,8 @@ namespace llvm { FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel); - extern Target ThePTXTarget; + extern Target ThePTX32Target; + extern Target ThePTX64Target; } // namespace llvm; // Defines symbolic names for PTX registers. diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td index 8b1a1b1..ae8326e 100644 --- a/lib/Target/PTX/PTX.td +++ b/lib/Target/PTX/PTX.td @@ -19,8 +19,35 @@ include "llvm/Target/Target.td" // Subtarget Features. //===----------------------------------------------------------------------===// -def FeatureSM20 : SubtargetFeature<"sm20", "is_sm20", "true", - "Enable sm_20 target architecture">; +//===- Architectural Features ---------------------------------------------===// + +def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true", + "Do not demote .f64 to .f32">; + +//===- PTX Version --------------------------------------------------------===// + +def FeaturePTX20 : SubtargetFeature<"ptx20", "PTXVersion", "PTX_VERSION_2_0", + "Use PTX Language Version 2.0", + []>; + +def FeaturePTX21 : SubtargetFeature<"ptx21", "PTXVersion", "PTX_VERSION_2_1", + "Use PTX Language Version 2.1", + [FeaturePTX20]>; + +def FeaturePTX22 : SubtargetFeature<"ptx22", "PTXVersion", "PTX_VERSION_2_2", + "Use PTX Language Version 2.2", + [FeaturePTX21]>; + +//===- PTX Shader Model ---------------------------------------------------===// + +def FeatureSM10 : SubtargetFeature<"sm10", "PTXShaderModel", "PTX_SM_1_0", + "Enable Shader Model 1.0 compliance">; +def FeatureSM13 : SubtargetFeature<"sm13", "PTXShaderModel", "PTX_SM_1_3", + "Enable Shader Model 1.3 compliance", + [FeatureSM10, FeatureDouble]>; +def FeatureSM20 : SubtargetFeature<"sm20", "PTXShaderModel", "PTX_SM_2_0", + "Enable Shader Model 2.0 compliance", + [FeatureSM13]>; //===----------------------------------------------------------------------===// // PTX supported processors. 
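// The feature definitions above lean on SubtargetFeature's implied-feature
// lists: asking for sm20 transitively enables sm13, sm10 and double
// support, and ptx22 pulls in ptx21 and ptx20. A minimal model of that
// transitive closure (names invented; this is not the TableGen machinery):
#include <map>
#include <set>
#include <string>
#include <vector>

static const std::map<std::string, std::vector<std::string>> Implies = {
  {"ptx21", {"ptx20"}},
  {"ptx22", {"ptx21"}},
  {"sm13", {"sm10", "double"}},
  {"sm20", {"sm13"}},
};

static void enable(const std::string &f, std::set<std::string> &on) {
  if (!on.insert(f).second)
    return; // already enabled, avoid re-walking
  auto it = Implies.find(f);
  if (it != Implies.end())
    for (const std::string &dep : it->second)
      enable(dep, on); // recurse into implied features
}

int main() {
  std::set<std::string> on;
  enable("sm20", on);  // yields sm20, sm13, sm10, double
  enable("ptx22", on); // yields ptx22, ptx21, ptx20
  return on.count("double") == 1 ? 0 : 1;
}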
diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp index a605997..29c4781 100644 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ b/lib/Target/PTX/PTXAsmPrinter.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/Mangler.h" @@ -37,13 +38,6 @@ using namespace llvm; -static cl::opt<std::string> -OptPTXVersion("ptx-version", cl::desc("Set PTX version"), cl::init("1.4")); - -static cl::opt<std::string> -OptPTXTarget("ptx-target", cl::desc("Set GPU target (comma-separated list)"), - cl::init("sm_10")); - namespace { class PTXAsmPrinter : public AsmPrinter { public: @@ -68,6 +62,7 @@ public: const char *Modifier = 0); void printParamOperand(const MachineInstr *MI, int opNum, raw_ostream &OS, const char *Modifier = 0); + void printPredicateOperand(const MachineInstr *MI, raw_ostream &O); // autogen'd. void printInstruction(const MachineInstr *MI, raw_ostream &OS); @@ -82,27 +77,20 @@ private: static const char PARAM_PREFIX[] = "__param_"; static const char *getRegisterTypeName(unsigned RegNo) { -#define TEST_REGCLS(cls, clsstr) \ +#define TEST_REGCLS(cls, clsstr) \ if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr; - TEST_REGCLS(RRegs32, s32); TEST_REGCLS(Preds, pred); + TEST_REGCLS(RRegu16, u16); + TEST_REGCLS(RRegu32, u32); + TEST_REGCLS(RRegu64, u64); + TEST_REGCLS(RRegf32, f32); + TEST_REGCLS(RRegf64, f64); #undef TEST_REGCLS llvm_unreachable("Not in any register class!"); return NULL; } -static const char *getInstructionTypeName(const MachineInstr *MI) { - for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (MO.getType() == MachineOperand::MO_Register) - return getRegisterTypeName(MO.getReg()); - } - - llvm_unreachable("No reg operand found in instruction!"); - return NULL; -} - static const char *getStateSpaceName(unsigned addressSpace) { switch (addressSpace) { default: llvm_unreachable("Unknown state space"); @@ -115,6 +103,28 @@ static const char *getStateSpaceName(unsigned addressSpace) { return NULL; } +static const char *getTypeName(const Type* type) { + while (true) { + switch (type->getTypeID()) { + default: llvm_unreachable("Unknown type"); + case Type::FloatTyID: return ".f32"; + case Type::DoubleTyID: return ".f64"; + case Type::IntegerTyID: + switch (type->getPrimitiveSizeInBits()) { + default: llvm_unreachable("Unknown integer bit-width"); + case 16: return ".u16"; + case 32: return ".u32"; + case 64: return ".u64"; + } + case Type::ArrayTyID: + case Type::PointerTyID: + type = dyn_cast<const SequentialType>(type)->getElementType(); + break; + } + } + return NULL; +} + bool PTXAsmPrinter::doFinalization(Module &M) { // XXX Temproarily remove global variables so that doFinalization() will not // emit them again (global variables are emitted at beginning). @@ -146,8 +156,12 @@ bool PTXAsmPrinter::doFinalization(Module &M) { void PTXAsmPrinter::EmitStartOfAsmFile(Module &M) { - OutStreamer.EmitRawText(Twine("\t.version " + OptPTXVersion)); - OutStreamer.EmitRawText(Twine("\t.target " + OptPTXTarget)); + const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>(); + + OutStreamer.EmitRawText(Twine("\t.version " + ST.getPTXVersionString())); + OutStreamer.EmitRawText(Twine("\t.target " + ST.getTargetString() + + (ST.supportsDouble() ? 
"" + : ", map_f64_to_f32"))); OutStreamer.AddBlankLine(); // declare global variables @@ -186,17 +200,16 @@ void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) { std::string str; str.reserve(64); - // Write instruction to str raw_string_ostream OS(str); + + // Emit predicate + printPredicateOperand(MI, OS); + + // Write instruction to str printInstruction(MI, OS); OS << ';'; OS.flush(); - // Replace "%type" if found - size_t pos; - if ((pos = str.find("%type")) != std::string::npos) - str.replace(pos, /*strlen("%type")==*/5, getInstructionTypeName(MI)); - StringRef strref = StringRef(str); OutStreamer.EmitRawText(strref); } @@ -213,11 +226,36 @@ void PTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, OS << *Mang->getSymbol(MO.getGlobal()); break; case MachineOperand::MO_Immediate: - OS << (int) MO.getImm(); + OS << (long) MO.getImm(); + break; + case MachineOperand::MO_MachineBasicBlock: + OS << *MO.getMBB()->getSymbol(); break; case MachineOperand::MO_Register: OS << getRegisterName(MO.getReg()); break; + case MachineOperand::MO_FPImmediate: + APInt constFP = MO.getFPImm()->getValueAPF().bitcastToAPInt(); + bool isFloat = MO.getFPImm()->getType()->getTypeID() == Type::FloatTyID; + // Emit 0F for 32-bit floats and 0D for 64-bit doubles. + if (isFloat) { + OS << "0F"; + } + else { + OS << "0D"; + } + // Emit the encoded floating-point value. + if (constFP.getZExtValue() > 0) { + OS << constFP.toString(16, false); + } + else { + OS << "00000000"; + // If We have a double-precision zero, pad to 8-bytes. + if (!isFloat) { + OS << "00000000"; + } + } + break; } } @@ -265,13 +303,77 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) { decl += " "; } - // TODO: add types - decl += ".s32 "; - decl += gvsym->getName(); + if (PointerType::classof(gv->getType())) { + const PointerType* pointerTy = dyn_cast<const PointerType>(gv->getType()); + const Type* elementTy = pointerTy->getElementType(); + + decl += ".b8 "; + decl += gvsym->getName(); + decl += "["; + + if (elementTy->isArrayTy()) + { + assert(elementTy->isArrayTy() && "Only pointers to arrays are supported"); - if (ArrayType::classof(gv->getType()) || PointerType::classof(gv->getType())) - decl += "[]"; + const ArrayType* arrayTy = dyn_cast<const ArrayType>(elementTy); + elementTy = arrayTy->getElementType(); + + unsigned numElements = arrayTy->getNumElements(); + + while (elementTy->isArrayTy()) { + + arrayTy = dyn_cast<const ArrayType>(elementTy); + elementTy = arrayTy->getElementType(); + + numElements *= arrayTy->getNumElements(); + } + + // FIXME: isPrimitiveType() == false for i16? + assert(elementTy->isSingleValueType() && + "Non-primitive types are not handled"); + + // Compute the size of the array, in bytes. + uint64_t arraySize = (elementTy->getPrimitiveSizeInBits() >> 3) + * numElements; + + decl += utostr(arraySize); + } + + decl += "]"; + + // handle string constants (assume ConstantArray means string) + + if (gv->hasInitializer()) + { + Constant *C = gv->getInitializer(); + if (const ConstantArray *CA = dyn_cast<ConstantArray>(C)) + { + decl += " = {"; + + for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) + { + if (i > 0) decl += ","; + + decl += "0x" + utohexstr(cast<ConstantInt>(CA->getOperand(i))->getZExtValue()); + } + + decl += "}"; + } + } + } + else { + // Note: this is currently the fall-through case and most likely generates + // incorrect code. 
+ decl += getTypeName(gv->getType()); + decl += " "; + + decl += gvsym->getName(); + + if (ArrayType::classof(gv->getType()) || + PointerType::classof(gv->getType())) + decl += "[]"; + } decl += ";"; @@ -313,16 +415,24 @@ void PTXAsmPrinter::EmitFunctionDeclaration() { if (!MFI->argRegEmpty()) { decl += " ("; if (isKernel) { - for (int i = 0, e = MFI->getNumArg(); i != e; ++i) { - if (i != 0) + unsigned cnt = 0; + for(PTXMachineFunctionInfo::reg_iterator + i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; + i != e; ++i) { + reg = *i; + assert(reg != PTX::NoRegister && "Not a valid register!"); + if (i != b) decl += ", "; - decl += ".param .s32 "; // TODO: add types + decl += ".param ."; + decl += getRegisterTypeName(reg); + decl += " "; decl += PARAM_PREFIX; - decl += utostr(i + 1); + decl += utostr(++cnt); } } else { for (PTXMachineFunctionInfo::reg_iterator - i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; i != e; ++i) { + i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; + i != e; ++i) { reg = *i; assert(reg != PTX::NoRegister && "Not a valid register!"); if (i != b) @@ -339,9 +449,29 @@ void PTXAsmPrinter::EmitFunctionDeclaration() { OutStreamer.EmitRawText(Twine(decl)); } +void PTXAsmPrinter:: +printPredicateOperand(const MachineInstr *MI, raw_ostream &O) { + int i = MI->findFirstPredOperandIdx(); + if (i == -1) + llvm_unreachable("missing predicate operand"); + + unsigned reg = MI->getOperand(i).getReg(); + int predOp = MI->getOperand(i+1).getImm(); + + DEBUG(dbgs() << "predicate: (" << reg << ", " << predOp << ")\n"); + + if (reg != PTX::NoRegister) { + O << '@'; + if (predOp == PTX::PRED_NEGATE) + O << '!'; + O << getRegisterName(reg); + } +} + #include "PTXGenAsmWriter.inc" // Force static initialization. extern "C" void LLVMInitializePTXAsmPrinter() { - RegisterAsmPrinter<PTXAsmPrinter> X(ThePTXTarget); + RegisterAsmPrinter<PTXAsmPrinter> X(ThePTX32Target); + RegisterAsmPrinter<PTXAsmPrinter> Y(ThePTX64Target); } diff --git a/lib/Target/PTX/PTXFrameLowering.h b/lib/Target/PTX/PTXFrameLowering.h index 574ae7a..9320676 100644 --- a/lib/Target/PTX/PTXFrameLowering.h +++ b/lib/Target/PTX/PTXFrameLowering.h @@ -27,7 +27,8 @@ protected: public: explicit PTXFrameLowering(const PTXSubtarget &sti) - : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), STI(sti) { + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), + STI(sti) { } /// emitProlog/emitEpilog - These methods insert prolog and epilog code into diff --git a/lib/Target/PTX/PTXISelDAGToDAG.cpp b/lib/Target/PTX/PTXISelDAGToDAG.cpp index efb0e8b..b3c85da 100644 --- a/lib/Target/PTX/PTXISelDAGToDAG.cpp +++ b/lib/Target/PTX/PTXISelDAGToDAG.cpp @@ -15,6 +15,7 @@ #include "PTXTargetMachine.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/DerivedTypes.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -42,8 +43,14 @@ class PTXDAGToDAGISel : public SelectionDAGISel { private: SDNode *SelectREAD_PARAM(SDNode *Node); + // We need this only because we can't match intruction BRAdp + // pattern (PTXbrcond bb:$d, ...) 
in PTXInstrInfo.td + SDNode *SelectBRCOND(SDNode *Node); + bool isImm(const SDValue &operand); bool SelectImm(const SDValue &operand, SDValue &imm); + + const PTXSubtarget& getSubtarget() const; }; // class PTXDAGToDAGISel } // namespace @@ -59,21 +66,62 @@ PTXDAGToDAGISel::PTXDAGToDAGISel(PTXTargetMachine &TM, : SelectionDAGISel(TM, OptLevel) {} SDNode *PTXDAGToDAGISel::Select(SDNode *Node) { - if (Node->getOpcode() == PTXISD::READ_PARAM) - return SelectREAD_PARAM(Node); - else - return SelectCode(Node); + switch (Node->getOpcode()) { + case PTXISD::READ_PARAM: + return SelectREAD_PARAM(Node); + case ISD::BRCOND: + return SelectBRCOND(Node); + default: + return SelectCode(Node); + } } SDNode *PTXDAGToDAGISel::SelectREAD_PARAM(SDNode *Node) { - SDValue index = Node->getOperand(1); - DebugLoc dl = Node->getDebugLoc(); + SDValue index = Node->getOperand(1); + DebugLoc dl = Node->getDebugLoc(); + unsigned opcode; if (index.getOpcode() != ISD::TargetConstant) llvm_unreachable("READ_PARAM: index is not ISD::TargetConstant"); + if (Node->getValueType(0) == MVT::i16) { + opcode = PTX::LDpiU16; + } + else if (Node->getValueType(0) == MVT::i32) { + opcode = PTX::LDpiU32; + } + else if (Node->getValueType(0) == MVT::i64) { + opcode = PTX::LDpiU64; + } + else if (Node->getValueType(0) == MVT::f32) { + opcode = PTX::LDpiF32; + } + else if (Node->getValueType(0) == MVT::f64) { + opcode = PTX::LDpiF64; + } + else { + llvm_unreachable("Unknown parameter type for ld.param"); + } + return PTXInstrInfo:: - GetPTXMachineNode(CurDAG, PTX::LDpi, dl, MVT::i32, index); + GetPTXMachineNode(CurDAG, opcode, dl, Node->getValueType(0), index); +} + +SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) { + assert(Node->getNumOperands() >= 3); + + SDValue Chain = Node->getOperand(0); + SDValue Pred = Node->getOperand(1); + SDValue Target = Node->getOperand(2); // branch target + SDValue PredOp = CurDAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32); + DebugLoc dl = Node->getDebugLoc(); + + assert(Target.getOpcode() == ISD::BasicBlock); + assert(Pred.getValueType() == MVT::i1); + + // Emit BRAdp + SDValue Ops[] = { Target, Pred, PredOp, Chain }; + return CurDAG->getMachineNode(PTX::BRAdp, dl, MVT::Other, Ops, 4); } // Match memory operand of the form [reg+reg] @@ -82,8 +130,11 @@ bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) { isImm(Addr.getOperand(0)) || isImm(Addr.getOperand(1))) return false; + assert(Addr.getValueType().isSimple() && "Type must be simple"); + R1 = Addr; - R2 = CurDAG->getTargetConstant(0, MVT::i32); + R2 = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); + return true; } @@ -95,8 +146,12 @@ bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base, if (isImm(Addr)) return false; // it is [reg] + + assert(Addr.getValueType().isSimple() && "Type must be simple"); + Base = Addr; - Offset = CurDAG->getTargetConstant(0, MVT::i32); + Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); + return true; } @@ -129,7 +184,10 @@ bool PTXDAGToDAGISel::SelectADDRii(SDValue &Addr, SDValue &Base, // is [imm]? 
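// Taken together, the SelectADDR* hooks in this file classify an address
// into PTX's three operand forms and pad the missing half with a zero
// constant of the address's own value type (the hard-coded MVT::i32 was
// the bug being fixed). A toy classifier with invented types, not the
// SelectionDAG interface:
#include <cstdint>
#include <cstdio>

struct Addr {
  bool baseIsImm;   // base is a constant rather than a register
  int64_t base;     // register number or immediate value
  int64_t offset;
  bool hasOffset;
};

static const char *classify(const Addr &a) {
  if (a.baseIsImm) return "[imm]";     // SelectADDRii
  if (a.hasOffset) return "[reg+imm]"; // SelectADDRri
  return "[reg+reg]";                  // SelectADDRrr, offset defaults to 0
}

int main() {
  std::printf("%s\n", classify({false, /*vreg*/ 5, 16, true})); // [reg+imm]
  std::printf("%s\n", classify({true, 4096, 0, false}));        // [imm]
  return 0;
}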
if (SelectImm(Addr, Base)) { - Offset = CurDAG->getTargetConstant(0, MVT::i32); + assert(Addr.getValueType().isSimple() && "Type must be simple"); + + Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT()); + return true; } @@ -146,6 +204,13 @@ bool PTXDAGToDAGISel::SelectImm(const SDValue &operand, SDValue &imm) { return false; ConstantSDNode *CN = cast<ConstantSDNode>(node); - imm = CurDAG->getTargetConstant(*CN->getConstantIntValue(), MVT::i32); + imm = CurDAG->getTargetConstant(*CN->getConstantIntValue(), + operand.getValueType()); return true; } + +const PTXSubtarget& PTXDAGToDAGISel::getSubtarget() const +{ + return TM.getSubtarget<PTXSubtarget>(); +} + diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp index e6d4490..23b93da 100644 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ b/lib/Target/PTX/PTXISelLowering.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -27,21 +28,60 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { // Set up the register classes. addRegisterClass(MVT::i1, PTX::PredsRegisterClass); - addRegisterClass(MVT::i32, PTX::RRegs32RegisterClass); - + addRegisterClass(MVT::i16, PTX::RRegu16RegisterClass); + addRegisterClass(MVT::i32, PTX::RRegu32RegisterClass); + addRegisterClass(MVT::i64, PTX::RRegu64RegisterClass); + addRegisterClass(MVT::f32, PTX::RRegf32RegisterClass); + addRegisterClass(MVT::f64, PTX::RRegf64RegisterClass); + + setBooleanContents(ZeroOrOneBooleanContent); + setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + // Turn i16 (z)extload into load + (z)extend + setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand); + + // Turn f32 extload into load + fextend + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + + // Turn f64 truncstore into trunc + store. 
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
   // Customize translation of memory addresses
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
-
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+  // Expand BR_CC into BRCOND
+  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+
+  // Expand SELECT_CC into SETCC
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+  // need to lower SETCC of Preds into bitwise logic
+  setOperationAction(ISD::SETCC, MVT::i1, Custom);
+
   // Compute derived properties from the register classes
   computeRegisterProperties();
 }
 
+MVT::SimpleValueType PTXTargetLowering::getSetCCResultType(EVT VT) const {
+  return MVT::i1;
+}
+
 SDValue PTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
-    default:                 llvm_unreachable("Unimplemented operand");
-    case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+    default:
+      llvm_unreachable("Unimplemented operand");
+    case ISD::SETCC:
+      return LowerSETCC(Op, DAG);
+    case ISD::GlobalAddress:
+      return LowerGlobalAddress(Op, DAG);
   }
 }
 
@@ -49,6 +89,8 @@ const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
     default:
      llvm_unreachable("Unknown opcode");
+    case PTXISD::COPY_ADDRESS:
+      return "PTXISD::COPY_ADDRESS";
     case PTXISD::READ_PARAM:
       return "PTXISD::READ_PARAM";
     case PTXISD::EXIT:
@@ -62,12 +104,43 @@ const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
 // Custom Lower Operation
 //===----------------------------------------------------------------------===//
 
+SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i1 && "SetCC type must be 1-bit integer");
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  // Look for X == 0, X == 1, X != 0, or X != 1
+  // We can simplify these to bitwise logic
+
+  if (Op1.getOpcode() == ISD::Constant &&
+      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
+       cast<ConstantSDNode>(Op1)->isNullValue()) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+    return DAG.getNode(ISD::AND, dl, MVT::i1, Op0, Op1);
+  }
+
+  return DAG.getNode(ISD::SETCC, dl, MVT::i1, Op0, Op1, Op2);
+}
+
 SDValue PTXTargetLowering::
 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy();
   DebugLoc dl = Op.getDebugLoc();
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  return DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+
+  assert(PtrVT.isSimple() && "Pointer must be to primitive type.");
+
+  SDValue targetGlobal = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+  SDValue movInstr = DAG.getNode(PTXISD::COPY_ADDRESS,
+                                 dl,
+                                 PtrVT.getSimpleVT(),
+                                 targetGlobal);
+
+  return movInstr;
 }
 
 //===----------------------------------------------------------------------===//
@@ -87,9 +160,13 @@ struct argmap_entry {
   bool operator==(MVT::SimpleValueType _VT) const { return VT == _VT; }
 } argmap[] = {
   argmap_entry(MVT::i1,  PTX::PredsRegisterClass),
-  argmap_entry(MVT::i32, PTX::RRegs32RegisterClass)
+  argmap_entry(MVT::i16, PTX::RRegu16RegisterClass),
+  argmap_entry(MVT::i32, PTX::RRegu32RegisterClass),
+  argmap_entry(MVT::i64, PTX::RRegu64RegisterClass),
+  argmap_entry(MVT::f32, PTX::RRegf32RegisterClass),
+  argmap_entry(MVT::f64, PTX::RRegf64RegisterClass)
 };
-} // end anonymous namespace
+} // end anonymous namespace
 
 SDValue PTXTargetLowering::
   LowerFormalArguments(SDValue Chain,
@@ -185,10 +262,25 @@ SDValue PTXTargetLowering::
   if (Outs.size() == 0)
     return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain);
 
-  assert(Outs[0].VT == MVT::i32 && "Can return only basic types");
-
   SDValue Flag;
-  unsigned reg = PTX::R0;
+  unsigned reg;
+
+  if (Outs[0].VT == MVT::i16) {
+    reg = PTX::RH0;
+  }
+  else if (Outs[0].VT == MVT::i32) {
+    reg = PTX::R0;
+  }
+  else if (Outs[0].VT == MVT::i64) {
+    reg = PTX::RD0;
+  }
+  else if (Outs[0].VT == MVT::f32) {
+    reg = PTX::F0;
+  }
+  else {
+    assert(Outs[0].VT == MVT::f64 && "Can return only basic types");
+    reg = PTX::FD0;
+  }
 
   MachineFunction &MF = DAG.getMachineFunction();
   PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h
index b03a9f6..6a7e3e6 100644
--- a/lib/Target/PTX/PTXISelLowering.h
+++ b/lib/Target/PTX/PTXISelLowering.h
@@ -26,7 +26,8 @@ namespace PTXISD {
     FIRST_NUMBER = ISD::BUILTIN_OP_END,
     READ_PARAM,
     EXIT,
-    RET
+    RET,
+    COPY_ADDRESS
   };
 } // namespace PTXISD
 
@@ -41,6 +42,8 @@ class PTXTargetLowering : public TargetLowering {
     virtual SDValue
       LowerOperation(SDValue Op, SelectionDAG &DAG) const;
 
+    virtual SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
@@ -58,7 +61,9 @@ class PTXTargetLowering : public TargetLowering {
                 const SmallVectorImpl<SDValue> &OutVals,
                 DebugLoc dl,
                 SelectionDAG &DAG) const;
-
+
+    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+
   private:
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
 }; // class PTXTargetLowering
diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp
index 805759b..a12a6d0 100644
--- a/lib/Target/PTX/PTXInstrInfo.cpp
+++ b/lib/Target/PTX/PTXInstrInfo.cpp
@@ -11,9 +11,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+#define DEBUG_TYPE "ptx-instrinfo"
+
 #include "PTX.h"
 #include "PTXInstrInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
@@ -27,20 +33,27 @@ static const struct map_entry {
   const TargetRegisterClass *cls;
   const int opcode;
 } map[] = {
-  { &PTX::RRegs32RegClass, PTX::MOVrr },
-  { &PTX::PredsRegClass,   PTX::MOVpp }
+  { &PTX::RRegu16RegClass, PTX::MOVU16rr },
+  { &PTX::RRegu32RegClass, PTX::MOVU32rr },
+  { &PTX::RRegu64RegClass, PTX::MOVU64rr },
+  { &PTX::RRegf32RegClass, PTX::MOVF32rr },
+  { &PTX::RRegf64RegClass, PTX::MOVF64rr },
+  { &PTX::PredsRegClass,   PTX::MOVPREDrr }
 };
 
 void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I, DebugLoc DL,
                                unsigned DstReg, unsigned SrcReg,
                                bool KillSrc) const {
-  for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i)
-    if (PTX::RRegs32RegClass.contains(DstReg, SrcReg)) {
-      BuildMI(MBB, I, DL,
-              get(PTX::MOVrr), DstReg).addReg(SrcReg, getKillRegState(KillSrc));
+  for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i) {
+    if (map[i].cls->contains(DstReg, SrcReg)) {
+      const TargetInstrDesc &TID = get(map[i].opcode);
+      MachineInstr *MI = BuildMI(MBB, I, DL, TID, DstReg).
+        addReg(SrcReg, getKillRegState(KillSrc));
+      AddDefaultPredicate(MI);
       return;
     }
+  }
 
   llvm_unreachable("Impossible reg-to-reg copy");
 }
@@ -56,12 +69,9 @@ bool PTXInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
   for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i)
     if (DstRC == map[i].cls) {
-      MachineInstr *MI = BuildMI(MBB, I, DL, get(map[i].opcode),
-                                 DstReg).addReg(SrcReg);
-      if (MI->findFirstPredOperandIdx() == -1) {
-        MI->addOperand(MachineOperand::CreateReg(0, false));
-        MI->addOperand(MachineOperand::CreateImm(/*IsInv=*/0));
-      }
+      const TargetInstrDesc &TID = get(map[i].opcode);
+      MachineInstr *MI = BuildMI(MBB, I, DL, TID, DstReg).addReg(SrcReg);
+      AddDefaultPredicate(MI);
       return true;
     }
@@ -74,8 +84,12 @@ bool PTXInstrInfo::isMoveInstr(const MachineInstr& MI,
   switch (MI.getOpcode()) {
     default:
       return false;
-    case PTX::MOVpp:
-    case PTX::MOVrr:
+    case PTX::MOVU16rr:
+    case PTX::MOVU32rr:
+    case PTX::MOVU64rr:
+    case PTX::MOVF32rr:
+    case PTX::MOVF64rr:
+    case PTX::MOVPREDrr:
       assert(MI.getNumOperands() >= 2 &&
              MI.getOperand(0).isReg() && MI.getOperand(1).isReg() &&
             "Invalid register-register move instruction");
@@ -85,3 +99,239 @@ bool PTXInstrInfo::isMoveInstr(const MachineInstr& MI,
       return true;
   }
 }
+
+// predicate support
+
+bool PTXInstrInfo::isPredicated(const MachineInstr *MI) const {
+  int i = MI->findFirstPredOperandIdx();
+  return i != -1 && MI->getOperand(i).getReg() != PTX::NoRegister;
+}
+
+bool PTXInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  return !isPredicated(MI) && get(MI->getOpcode()).isTerminator();
+}
+
+bool PTXInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Pred) const {
+  if (Pred.size() < 2)
+    llvm_unreachable("fewer than 2 predicate operands provided");
+
+  int i = MI->findFirstPredOperandIdx();
+  if (i == -1)
+    llvm_unreachable("missing predicate operand");
+
+  MI->getOperand(i).setReg(Pred[0].getReg());
+  MI->getOperand(i+1).setImm(Pred[1].getImm());
+
+  return true;
+}
+
+bool PTXInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                  const SmallVectorImpl<MachineOperand> &Pred2) const {
+  const MachineOperand &PredReg1 = Pred1[0];
+  const MachineOperand &PredReg2 = Pred2[0];
+  if (PredReg1.getReg() != PredReg2.getReg())
+    return false;
+
+  const MachineOperand &PredOp1 = Pred1[1];
+  const MachineOperand &PredOp2 = Pred2[1];
+  if (PredOp1.getImm() != PredOp2.getImm())
+    return false;
+
+  return true;
+}
+
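For context: these predication hooks exist to serve the generic if-conversion and branch-folding passes, which traffic in the same (predicate register, predicate-mode immediate) operand pair that AddDefaultPredicate appends. A minimal sketch of how a client pass might use them, assuming a live predicate register such as PTX::P0, a MachineInstr *MI, and a const TargetInstrInfo *TII; the surrounding pass code is illustrative only, not part of this patch:

    // Predicate MI on p0 using the two-operand convention above.
    SmallVector<MachineOperand, 2> Pred;
    Pred.push_back(MachineOperand::CreateReg(PTX::P0, /*isDef=*/false));
    Pred.push_back(MachineOperand::CreateImm(PTX::PRED_NORMAL));
    if (TII->isPredicable(MI) && !TII->isPredicated(MI))
      TII->PredicateInstruction(MI, Pred);  // rewrites the trailing pair in place

PredicateInstruction then only has to overwrite the placeholder operands that AddDefaultPredicate (or the instruction format itself) put in place.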
+bool PTXInstrInfo::
+DefinesPredicate(MachineInstr *MI,
+                 std::vector<MachineOperand> &Pred) const {
+  // If an instruction sets a predicate register, it defines a predicate.
+
+  // TODO: support the 5-operand format of the setp instruction
+
+  if (MI->getNumOperands() < 1)
+    return false;
+
+  const MachineOperand &MO = MI->getOperand(0);
+
+  if (!MO.isReg() || RI.getRegClass(MO.getReg()) != &PTX::PredsRegClass)
+    return false;
+
+  Pred.push_back(MO);
+  Pred.push_back(MachineOperand::CreateImm(PTX::PRED_NORMAL));
+  return true;
+}
+
+// branch support
+
+bool PTXInstrInfo::
+AnalyzeBranch(MachineBasicBlock &MBB,
+              MachineBasicBlock *&TBB,
+              MachineBasicBlock *&FBB,
+              SmallVectorImpl<MachineOperand> &Cond,
+              bool AllowModify) const {
+  // TODO: implement the cases where AllowModify is true
+
+  if (MBB.empty())
+    return true;
+
+  MachineBasicBlock::const_iterator iter = MBB.end();
+  const MachineInstr& instLast1 = *--iter;
+  const TargetInstrDesc &desc1 = instLast1.getDesc();
+  // handle the special case where MBB contains only one instruction
+  const bool IsSizeOne = MBB.size() == 1;
+  // when IsSizeOne is true, *--iter is invalid, so give instLast2 and desc2
+  // dummy values; they are still referenced below
+  const MachineInstr& instLast2 = IsSizeOne ? instLast1 : *--iter;
+  const TargetInstrDesc &desc2 = IsSizeOne ? desc1 : instLast2.getDesc();
+
+  DEBUG(dbgs() << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: opcode: " << instLast1.getOpcode() << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: MBB: " << MBB.getName().str() << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: TBB: " << TBB << "\n");
+  DEBUG(dbgs() << "AnalyzeBranch: FBB: " << FBB << "\n");
+
+  // this block ends with no branches
+  if (!IsAnyKindOfBranch(instLast1)) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with no branch\n");
+    return false;
+  }
+
+  // this block ends with only an unconditional branch
+  if (desc1.isUnconditionalBranch() &&
+      // when IsSizeOne is true, it "absorbs" the evaluation of instLast2
+      (IsSizeOne || !IsAnyKindOfBranch(instLast2))) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with only uncond branch\n");
+    TBB = GetBranchTarget(instLast1);
+    return false;
+  }
+
+  // this block ends with a conditional branch and
+  // it falls through to a successor block
+  if (desc1.isConditionalBranch() &&
+      IsAnySuccessorAlsoLayoutSuccessor(MBB)) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with cond branch and fall through\n");
+    TBB = GetBranchTarget(instLast1);
+    int i = instLast1.findFirstPredOperandIdx();
+    Cond.push_back(instLast1.getOperand(i));
+    Cond.push_back(instLast1.getOperand(i+1));
+    return false;
+  }
+
+  // when IsSizeOne is true, we are done
+  if (IsSizeOne)
+    return true;
+
+  // this block ends with a conditional branch
+  // followed by an unconditional branch
+  if (desc2.isConditionalBranch() &&
+      desc1.isUnconditionalBranch()) {
+    DEBUG(dbgs() << "AnalyzeBranch: ends with cond and uncond branch\n");
+    TBB = GetBranchTarget(instLast2);
+    FBB = GetBranchTarget(instLast1);
+    int i = instLast2.findFirstPredOperandIdx();
+    Cond.push_back(instLast2.getOperand(i));
+    Cond.push_back(instLast2.getOperand(i+1));
+    return false;
+  }
+
+  // the branch sequence cannot be analyzed
+  DEBUG(dbgs() << "AnalyzeBranch: cannot be understood\n");
+  return true;
+}
+
+unsigned PTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  unsigned count = 0;
+  while (!MBB.empty())
+    if (IsAnyKindOfBranch(MBB.back())) {
+      MBB.pop_back();
+      ++count;
+    } else
+      break;
+  DEBUG(dbgs() << "RemoveBranch: MBB: " << MBB.getName().str() << "\n");
+  DEBUG(dbgs() << "RemoveBranch: removed " << count << " branch inst\n");
+  return count;
+}
+
+unsigned PTXInstrInfo::
+InsertBranch(MachineBasicBlock &MBB,
+             MachineBasicBlock *TBB,
+             MachineBasicBlock *FBB,
+             const
SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + DEBUG(dbgs() << "InsertBranch: MBB: " << MBB.getName().str() << "\n"); + DEBUG(if (TBB) dbgs() << "InsertBranch: TBB: " << TBB->getName().str() + << "\n"; + else dbgs() << "InsertBranch: TBB: (NULL)\n"); + DEBUG(if (FBB) dbgs() << "InsertBranch: FBB: " << FBB->getName().str() + << "\n"; + else dbgs() << "InsertBranch: FBB: (NULL)\n"); + DEBUG(dbgs() << "InsertBranch: Cond size: " << Cond.size() << "\n"); + + assert(TBB && "TBB is NULL"); + + if (FBB) { + BuildMI(&MBB, DL, get(PTX::BRAdp)) + .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm()); + BuildMI(&MBB, DL, get(PTX::BRAd)) + .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTX::PRED_NORMAL); + return 2; + } else if (Cond.size()) { + BuildMI(&MBB, DL, get(PTX::BRAdp)) + .addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm()); + return 1; + } else { + BuildMI(&MBB, DL, get(PTX::BRAd)) + .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTX::PRED_NORMAL); + return 1; + } +} + +// static helper routines + +MachineSDNode *PTXInstrInfo:: +GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, + DebugLoc dl, EVT VT, SDValue Op1) { + SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1); + SDValue predOp = DAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32); + SDValue ops[] = { Op1, predReg, predOp }; + return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); +} + +MachineSDNode *PTXInstrInfo:: +GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, + DebugLoc dl, EVT VT, SDValue Op1, SDValue Op2) { + SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1); + SDValue predOp = DAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32); + SDValue ops[] = { Op1, Op2, predReg, predOp }; + return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); +} + +void PTXInstrInfo::AddDefaultPredicate(MachineInstr *MI) { + if (MI->findFirstPredOperandIdx() == -1) { + MI->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false)); + MI->addOperand(MachineOperand::CreateImm(PTX::PRED_NORMAL)); + } +} + +bool PTXInstrInfo::IsAnyKindOfBranch(const MachineInstr& inst) { + const TargetInstrDesc &desc = inst.getDesc(); + return desc.isTerminator() || desc.isBranch() || desc.isIndirectBranch(); +} + +bool PTXInstrInfo:: +IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB) { + for (MachineBasicBlock::const_succ_iterator + i = MBB.succ_begin(), e = MBB.succ_end(); i != e; ++i) + if (MBB.isLayoutSuccessor((const MachineBasicBlock*) &*i)) + return true; + return false; +} + +MachineBasicBlock *PTXInstrInfo::GetBranchTarget(const MachineInstr& inst) { + // FIXME So far all branch instructions put destination in 1st operand + const MachineOperand& target = inst.getOperand(0); + assert(target.isMBB() && "FIXME: detect branch target operand"); + return target.getMBB(); +} diff --git a/lib/Target/PTX/PTXInstrInfo.h b/lib/Target/PTX/PTXInstrInfo.h index e7f00f0..a04be77 100644 --- a/lib/Target/PTX/PTXInstrInfo.h +++ b/lib/Target/PTX/PTXInstrInfo.h @@ -15,61 +15,93 @@ #define PTX_INSTR_INFO_H #include "PTXRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/Target/TargetInstrInfo.h" namespace llvm { class PTXTargetMachine; +class MachineSDNode; +class SDValue; +class SelectionDAG; + class PTXInstrInfo : public TargetInstrInfoImpl { - private: - const PTXRegisterInfo RI; - PTXTargetMachine &TM; - - public: - explicit PTXInstrInfo(PTXTargetMachine &_TM); - - virtual const PTXRegisterInfo 
&getRegisterInfo() const { return RI; } - - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, unsigned SrcReg, - bool KillSrc) const; - - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg, - const TargetRegisterClass *DstRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; - - virtual bool isMoveInstr(const MachineInstr& MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SrcSubIdx, unsigned &DstSubIdx) const; - - // static helper routines - - static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, - SDValue Op1) { - SDValue pred_reg = DAG->getRegister(0, MVT::i1); - SDValue pred_imm = DAG->getTargetConstant(0, MVT::i32); - SDValue ops[] = { Op1, pred_reg, pred_imm }; - return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); - } - - static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, - DebugLoc dl, EVT VT, - SDValue Op1, - SDValue Op2) { - SDValue pred_reg = DAG->getRegister(0, MVT::i1); - SDValue pred_imm = DAG->getTargetConstant(0, MVT::i32); - SDValue ops[] = { Op1, Op2, pred_reg, pred_imm }; - return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops)); - } - - }; // class PTXInstrInfo +private: + const PTXRegisterInfo RI; + PTXTargetMachine &TM; + +public: + explicit PTXInstrInfo(PTXTargetMachine &_TM); + + virtual const PTXRegisterInfo &getRegisterInfo() const { return RI; } + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, unsigned SrcReg, + bool KillSrc) const; + + virtual bool copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg, + const TargetRegisterClass *DstRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const; + + virtual bool isMoveInstr(const MachineInstr& MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SrcSubIdx, unsigned &DstSubIdx) const; + + // predicate support + + virtual bool isPredicated(const MachineInstr *MI) const; + + virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const; + + virtual + bool PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const; + + virtual + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + + virtual bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + + // PTX is fully-predicable + virtual bool isPredicable(MachineInstr *MI) const { return true; } + + // branch support + + virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const; + + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + + // static helper routines + + static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, + DebugLoc dl, EVT VT, + SDValue Op1); + + static MachineSDNode *GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode, + DebugLoc dl, EVT VT, + SDValue Op1, SDValue Op2); + + static void AddDefaultPredicate(MachineInstr *MI); + + static bool IsAnyKindOfBranch(const MachineInstr& inst); + + static bool 
IsAnySuccessorAlsoLayoutSuccessor(const MachineBasicBlock& MBB); + + static MachineBasicBlock *GetBranchTarget(const MachineInstr& inst); +}; // class PTXInstrInfo } // namespace llvm #endif // PTX_INSTR_INFO_H diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td index 9a74778..1ac9d3f 100644 --- a/lib/Target/PTX/PTXInstrInfo.td +++ b/lib/Target/PTX/PTXInstrInfo.td @@ -18,6 +18,26 @@ include "PTXInstrFormats.td" //===----------------------------------------------------------------------===// +// Code Generation Predicates +//===----------------------------------------------------------------------===// + +// Addressing +def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">; +def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">; + +// Shader Model Support +def SupportsSM13 : Predicate<"getSubtarget().supportsSM13()">; +def DoesNotSupportSM13 : Predicate<"!getSubtarget().supportsSM13()">; +def SupportsSM20 : Predicate<"getSubtarget().supportsSM20()">; +def DoesNotSupportSM20 : Predicate<"!getSubtarget().supportsSM20()">; + +// PTX Version Support +def SupportsPTX21 : Predicate<"getSubtarget().supportsPTX21()">; +def DoesNotSupportPTX21 : Predicate<"!getSubtarget().supportsPTX21()">; +def SupportsPTX22 : Predicate<"getSubtarget().supportsPTX22()">; +def DoesNotSupportPTX22 : Predicate<"!getSubtarget().supportsPTX22()">; + +//===----------------------------------------------------------------------===// // Instruction Pattern Stuff //===----------------------------------------------------------------------===// @@ -107,24 +127,41 @@ def store_shared }]>; // Addressing modes. -def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; -def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [], []>; -def ADDRii : ComplexPattern<i32, 2, "SelectADDRii", [], []>; +def ADDRrr32 : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; +def ADDRrr64 : ComplexPattern<i64, 2, "SelectADDRrr", [], []>; +def ADDRri32 : ComplexPattern<i32, 2, "SelectADDRri", [], []>; +def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri", [], []>; +def ADDRii32 : ComplexPattern<i32, 2, "SelectADDRii", [], []>; +def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>; // Address operands -def MEMri : Operand<i32> { +def MEMri32 : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops RRegu32, i32imm); +} +def MEMri64 : Operand<i64> { let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops RRegs32, i32imm); + let MIOperandInfo = (ops RRegu64, i64imm); } -def MEMii : Operand<i32> { +def MEMii32 : Operand<i32> { let PrintMethod = "printMemOperand"; let MIOperandInfo = (ops i32imm, i32imm); } +def MEMii64 : Operand<i64> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops i64imm, i64imm); +} +// The operand here does not correspond to an actual address, so we +// can use i32 in 64-bit address modes. def MEMpi : Operand<i32> { let PrintMethod = "printParamOperand"; let MIOperandInfo = (ops i32imm); } +// Branch & call targets have OtherVT type. 
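The predicate strings in the Code Generation Predicates block above are spliced verbatim into the generated instruction selector, so they resolve against the getSubtarget() helper added to PTXDAGToDAGISel earlier in this patch. The PTXSubtarget accessors themselves are outside this diff; a plausible sketch, assuming the subtarget records shader-model and PTX-version enums (all names below are assumptions, not code from the patch):

    // Possible shape of the accessors the Requires<[...]> clauses call.
    bool PTXSubtarget::supportsSM13() const {
      return PTXShaderModel >= PTX_SM_1_3;    // assumed field and enumerator
    }
    bool PTXSubtarget::supportsPTX21() const {
      return PTXVersion >= PTX_VERSION_2_1;   // assumed field and enumerator
    }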
+def brtarget : Operand<OtherVT>; +def calltarget : Operand<i32>; + //===----------------------------------------------------------------------===// // PTX Specific Node Definitions //===----------------------------------------------------------------------===// @@ -138,66 +175,389 @@ def PTXexit : SDNode<"PTXISD::EXIT", SDTNone, [SDNPHasChain]>; def PTXret : SDNode<"PTXISD::RET", SDTNone, [SDNPHasChain]>; +def PTXcopyaddress + : SDNode<"PTXISD::COPY_ADDRESS", SDTypeProfile<1, 1, []>, []>; //===----------------------------------------------------------------------===// // Instruction Class Templates //===----------------------------------------------------------------------===// +//===- Floating-Point Instructions - 2 Operand Form -----------------------===// +multiclass PTX_FLOAT_2OP<string opcstr, SDNode opnode> { + def rr32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a), + !strconcat(opcstr, ".f32\t$d, $a"), + [(set RRegf32:$d, (opnode RRegf32:$a))]>; + def ri32 : InstPTX<(outs RRegf32:$d), + (ins f32imm:$a), + !strconcat(opcstr, ".f32\t$d, $a"), + [(set RRegf32:$d, (opnode fpimm:$a))]>; + def rr64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a), + !strconcat(opcstr, ".f64\t$d, $a"), + [(set RRegf64:$d, (opnode RRegf64:$a))]>; + def ri64 : InstPTX<(outs RRegf64:$d), + (ins f64imm:$a), + !strconcat(opcstr, ".f64\t$d, $a"), + [(set RRegf64:$d, (opnode fpimm:$a))]>; +} + +//===- Floating-Point Instructions - 3 Operand Form -----------------------===// +multiclass PTX_FLOAT_3OP<string opcstr, SDNode opnode> { + def rr32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b), + !strconcat(opcstr, ".f32\t$d, $a, $b"), + [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>; + def ri32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, f32imm:$b), + !strconcat(opcstr, ".f32\t$d, $a, $b"), + [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>; + def rr64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b), + !strconcat(opcstr, ".f64\t$d, $a, $b"), + [(set RRegf64:$d, (opnode RRegf64:$a, RRegf64:$b))]>; + def ri64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, f64imm:$b), + !strconcat(opcstr, ".f64\t$d, $a, $b"), + [(set RRegf64:$d, (opnode RRegf64:$a, fpimm:$b))]>; +} + +//===- Floating-Point Instructions - 4 Operand Form -----------------------===// +multiclass PTX_FLOAT_4OP<string opcstr, SDNode opnode1, SDNode opnode2> { + def rrr32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b, RRegf32:$c), + !strconcat(opcstr, ".f32\t$d, $a, $b, $c"), + [(set RRegf32:$d, (opnode2 (opnode1 RRegf32:$a, + RRegf32:$b), + RRegf32:$c))]>; + def rri32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b, f32imm:$c), + !strconcat(opcstr, ".f32\t$d, $a, $b, $c"), + [(set RRegf32:$d, (opnode2 (opnode1 RRegf32:$a, + RRegf32:$b), + fpimm:$c))]>; + def rrr64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b, RRegf64:$c), + !strconcat(opcstr, ".f64\t$d, $a, $b, $c"), + [(set RRegf64:$d, (opnode2 (opnode1 RRegf64:$a, + RRegf64:$b), + RRegf64:$c))]>; + def rri64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b, f64imm:$c), + !strconcat(opcstr, ".f64\t$d, $a, $b, $c"), + [(set RRegf64:$d, (opnode2 (opnode1 RRegf64:$a, + RRegf64:$b), + fpimm:$c))]>; +} + multiclass INT3<string opcstr, SDNode opnode> { - def rr : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, RRegs32:$b), - !strconcat(opcstr, ".%type\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, RRegs32:$b))]>; - def ri : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, i32imm:$b), - !strconcat(opcstr, ".%type\t$d, $a, 
$b"), - [(set RRegs32:$d, (opnode RRegs32:$a, imm:$b))]>; + def rr16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, RRegu16:$b), + !strconcat(opcstr, ".u16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; + def ri16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, i16imm:$b), + !strconcat(opcstr, ".u16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; + def rr32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, RRegu32:$b), + !strconcat(opcstr, ".u32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; + def ri32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, i32imm:$b), + !strconcat(opcstr, ".u32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; + def rr64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, RRegu64:$b), + !strconcat(opcstr, ".u64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; + def ri64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, i64imm:$b), + !strconcat(opcstr, ".u64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; +} + +multiclass PTX_LOGIC<string opcstr, SDNode opnode> { + def ripreds : InstPTX<(outs Preds:$d), + (ins Preds:$a, i1imm:$b), + !strconcat(opcstr, ".pred\t$d, $a, $b"), + [(set Preds:$d, (opnode Preds:$a, imm:$b))]>; + def rrpreds : InstPTX<(outs Preds:$d), + (ins Preds:$a, Preds:$b), + !strconcat(opcstr, ".pred\t$d, $a, $b"), + [(set Preds:$d, (opnode Preds:$a, Preds:$b))]>; + def rr16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, RRegu16:$b), + !strconcat(opcstr, ".b16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; + def ri16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, i16imm:$b), + !strconcat(opcstr, ".b16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; + def rr32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, RRegu32:$b), + !strconcat(opcstr, ".b32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; + def ri32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, i32imm:$b), + !strconcat(opcstr, ".b32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; + def rr64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, RRegu64:$b), + !strconcat(opcstr, ".b64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; + def ri64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, i64imm:$b), + !strconcat(opcstr, ".b64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; } -// no %type directive, non-communtable multiclass INT3ntnc<string opcstr, SDNode opnode> { - def rr : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, RRegs32:$b), - !strconcat(opcstr, "\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, RRegs32:$b))]>; - def ri : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, i32imm:$b), - !strconcat(opcstr, "\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, imm:$b))]>; - def ir : InstPTX<(outs RRegs32:$d), - (ins i32imm:$a, RRegs32:$b), - !strconcat(opcstr, "\t$d, $a, $b"), - [(set RRegs32:$d, (opnode imm:$a, RRegs32:$b))]>; + def rr16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, RRegu16:$b), + !strconcat(opcstr, "16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; + def rr32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, RRegu32:$b), + !strconcat(opcstr, "32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; + def rr64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, RRegu64:$b), + !strconcat(opcstr, "64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; + def ri16 : 
InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, i16imm:$b), + !strconcat(opcstr, "16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; + def ri32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, i32imm:$b), + !strconcat(opcstr, "32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; + def ri64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, i64imm:$b), + !strconcat(opcstr, "64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; + def ir16 : InstPTX<(outs RRegu16:$d), + (ins i16imm:$a, RRegu16:$b), + !strconcat(opcstr, "16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode imm:$a, RRegu16:$b))]>; + def ir32 : InstPTX<(outs RRegu32:$d), + (ins i32imm:$a, RRegu32:$b), + !strconcat(opcstr, "32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode imm:$a, RRegu32:$b))]>; + def ir64 : InstPTX<(outs RRegu64:$d), + (ins i64imm:$a, RRegu64:$b), + !strconcat(opcstr, "64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode imm:$a, RRegu64:$b))]>; } -multiclass PTX_LD<string opstr, RegisterClass RC, PatFrag pat_load> { - def rr : InstPTX<(outs RC:$d), - (ins MEMri:$a), - !strconcat(opstr, ".%type\t$d, [$a]"), - [(set RC:$d, (pat_load ADDRrr:$a))]>; - def ri : InstPTX<(outs RC:$d), - (ins MEMri:$a), - !strconcat(opstr, ".%type\t$d, [$a]"), - [(set RC:$d, (pat_load ADDRri:$a))]>; - def ii : InstPTX<(outs RC:$d), - (ins MEMii:$a), - !strconcat(opstr, ".%type\t$d, [$a]"), - [(set RC:$d, (pat_load ADDRii:$a))]>; +multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls, + CondCode cmp, string cmpstr> { + // TODO support 5-operand format: p|q, a, b, c + + def rr + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b), + !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), + [(set Preds:$p, (setcc RC:$a, RC:$b, cmp))]>; + def ri + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b), + !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), + [(set Preds:$p, (setcc RC:$a, imm:$b, cmp))]>; + + def rr_and_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, cmp), Preds:$c))]>; + def ri_and_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (and (setcc RC:$a, imm:$b, cmp), Preds:$c))]>; + def rr_or_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, cmp), Preds:$c))]>; + def ri_or_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (or (setcc RC:$a, imm:$b, cmp), Preds:$c))]>; + def rr_xor_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, cmp), Preds:$c))]>; + def ri_xor_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (xor (setcc RC:$a, imm:$b, cmp), Preds:$c))]>; + + def rr_and_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>; + def ri_and_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", 
regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (and (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>; + def rr_or_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>; + def ri_or_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (or (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>; + def rr_xor_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>; + def ri_xor_not_r + : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (xor (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>; } -multiclass PTX_ST<string opstr, RegisterClass RC, PatFrag pat_store> { - def rr : InstPTX<(outs), - (ins RC:$d, MEMri:$a), - !strconcat(opstr, ".%type\t[$a], $d"), - [(pat_store RC:$d, ADDRrr:$a)]>; - def ri : InstPTX<(outs), - (ins RC:$d, MEMri:$a), - !strconcat(opstr, ".%type\t[$a], $d"), - [(pat_store RC:$d, ADDRri:$a)]>; - def ii : InstPTX<(outs), - (ins RC:$d, MEMii:$a), - !strconcat(opstr, ".%type\t[$a], $d"), - [(pat_store RC:$d, ADDRii:$a)]>; +multiclass PTX_SETP_FP<RegisterClass RC, string regclsname, + CondCode ucmp, CondCode ocmp, string cmpstr> { + // TODO support 5-operand format: p|q, a, b, c + + def rr_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b), + !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"), + [(set Preds:$p, (setcc RC:$a, RC:$b, ucmp))]>; + def rr_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b), + !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"), + [(set Preds:$p, (setcc RC:$a, RC:$b, ocmp))]>; + + def rr_and_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, ucmp), Preds:$c))]>; + def rr_and_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, ocmp), Preds:$c))]>; + + def rr_or_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, ucmp), Preds:$c))]>; + def rr_or_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, ocmp), Preds:$c))]>; + + def rr_xor_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, ucmp), Preds:$c))]>; + def rr_xor_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, ocmp), Preds:$c))]>; + + def rr_and_not_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, ucmp), (not Preds:$c)))]>; + def rr_and_not_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + 
!strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (and (setcc RC:$a, RC:$b, ocmp), (not Preds:$c)))]>; + + def rr_or_not_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, ucmp), (not Preds:$c)))]>; + def rr_or_not_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (or (setcc RC:$a, RC:$b, ocmp), (not Preds:$c)))]>; + + def rr_xor_not_r_u + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, ucmp), (not Preds:$c)))]>; + def rr_xor_not_r_o + : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c), + !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"), + [(set Preds:$p, (xor (setcc RC:$a, RC:$b, ocmp), (not Preds:$c)))]>; +} + +multiclass PTX_SELP<RegisterClass RC, string regclsname> { + def rr + : InstPTX<(outs RC:$r), (ins Preds:$a, RC:$b, RC:$c), + !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"), + [(set RC:$r, (select Preds:$a, RC:$b, RC:$c))]>; +} + +multiclass PTX_LD<string opstr, string typestr, RegisterClass RC, PatFrag pat_load> { + def rr32 : InstPTX<(outs RC:$d), + (ins MEMri32:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRrr32:$a))]>, Requires<[Use32BitAddresses]>; + def rr64 : InstPTX<(outs RC:$d), + (ins MEMri64:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRrr64:$a))]>, Requires<[Use64BitAddresses]>; + def ri32 : InstPTX<(outs RC:$d), + (ins MEMri32:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRri32:$a))]>, Requires<[Use32BitAddresses]>; + def ri64 : InstPTX<(outs RC:$d), + (ins MEMri64:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRri64:$a))]>, Requires<[Use64BitAddresses]>; + def ii32 : InstPTX<(outs RC:$d), + (ins MEMii32:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRii32:$a))]>, Requires<[Use32BitAddresses]>; + def ii64 : InstPTX<(outs RC:$d), + (ins MEMii64:$a), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), + [(set RC:$d, (pat_load ADDRii64:$a))]>, Requires<[Use64BitAddresses]>; +} + +multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> { + defm u16 : PTX_LD<opstr, ".u16", RRegu16, pat_load>; + defm u32 : PTX_LD<opstr, ".u32", RRegu32, pat_load>; + defm u64 : PTX_LD<opstr, ".u64", RRegu64, pat_load>; + defm f32 : PTX_LD<opstr, ".f32", RRegf32, pat_load>; + defm f64 : PTX_LD<opstr, ".f64", RRegf64, pat_load>; +} + +multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, PatFrag pat_store> { + def rr32 : InstPTX<(outs), + (ins RC:$d, MEMri32:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRrr32:$a)]>, Requires<[Use32BitAddresses]>; + def rr64 : InstPTX<(outs), + (ins RC:$d, MEMri64:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRrr64:$a)]>, Requires<[Use64BitAddresses]>; + def ri32 : InstPTX<(outs), + (ins RC:$d, MEMri32:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRri32:$a)]>, Requires<[Use32BitAddresses]>; + def ri64 : InstPTX<(outs), + (ins RC:$d, MEMri64:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + 
[(pat_store RC:$d, ADDRri64:$a)]>, Requires<[Use64BitAddresses]>; + def ii32 : InstPTX<(outs), + (ins RC:$d, MEMii32:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRii32:$a)]>, Requires<[Use32BitAddresses]>; + def ii64 : InstPTX<(outs), + (ins RC:$d, MEMii64:$a), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), + [(pat_store RC:$d, ADDRii64:$a)]>, Requires<[Use64BitAddresses]>; +} + +multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> { + defm u16 : PTX_ST<opstr, ".u16", RRegu16, pat_store>; + defm u32 : PTX_ST<opstr, ".u32", RRegu32, pat_store>; + defm u64 : PTX_ST<opstr, ".u64", RRegu64, pat_store>; + defm f32 : PTX_ST<opstr, ".f32", RRegf32, pat_store>; + defm f64 : PTX_ST<opstr, ".f64", RRegf64, pat_store>; } //===----------------------------------------------------------------------===// @@ -208,50 +568,392 @@ multiclass PTX_ST<string opstr, RegisterClass RC, PatFrag pat_store> { defm ADD : INT3<"add", add>; defm SUB : INT3<"sub", sub>; +defm MUL : INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies +defm DIV : INT3<"div", udiv>; +defm REM : INT3<"rem", urem>; + +///===- Floating-Point Arithmetic Instructions ----------------------------===// + +// Standard Unary Operations +defm FNEG : PTX_FLOAT_2OP<"neg", fneg>; + +// Standard Binary Operations +defm FADD : PTX_FLOAT_3OP<"add", fadd>; +defm FSUB : PTX_FLOAT_3OP<"sub", fsub>; +defm FMUL : PTX_FLOAT_3OP<"mul", fmul>; + +// TODO: Allow user selection of rounding modes for fdiv. +// For division, we need to have f32 and f64 differently. +// For f32, we just always use .approx since it is supported on all hardware +// for PTX 1.4+, which is our minimum target. +def FDIVrr32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b), + "div.approx.f32\t$d, $a, $b", + [(set RRegf32:$d, (fdiv RRegf32:$a, RRegf32:$b))]>; +def FDIVri32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, f32imm:$b), + "div.approx.f32\t$d, $a, $b", + [(set RRegf32:$d, (fdiv RRegf32:$a, fpimm:$b))]>; + +// For f64, we must specify a rounding for sm 1.3+ but *not* for sm 1.0. +def FDIVrr64SM13 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b), + "div.rn.f64\t$d, $a, $b", + [(set RRegf64:$d, (fdiv RRegf64:$a, RRegf64:$b))]>, + Requires<[SupportsSM13]>; +def FDIVri64SM13 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, f64imm:$b), + "div.rn.f64\t$d, $a, $b", + [(set RRegf64:$d, (fdiv RRegf64:$a, fpimm:$b))]>, + Requires<[SupportsSM13]>; +def FDIVrr64SM10 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b), + "div.f64\t$d, $a, $b", + [(set RRegf64:$d, (fdiv RRegf64:$a, RRegf64:$b))]>, + Requires<[DoesNotSupportSM13]>; +def FDIVri64SM10 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, f64imm:$b), + "div.f64\t$d, $a, $b", + [(set RRegf64:$d, (fdiv RRegf64:$a, fpimm:$b))]>, + Requires<[DoesNotSupportSM13]>; + + + +// Multi-operation hybrid instructions + +// The selection of mad/fma is tricky. In some cases, they are the *same* +// instruction, but in other cases we may prefer one or the other. Also, +// different PTX versions differ on whether rounding mode flags are required. +// In the short term, mad is supported on all PTX versions and we use a +// default rounding mode no matter what shader model or PTX version. +// TODO: Allow the rounding mode to be selectable through llc. 
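If the rounding-mode TODO above were taken up, the most direct mechanism would be an llc command-line option whose value is spliced into the mnemonic. A hypothetical sketch; no such flag exists in this patch:

    #include "llvm/Support/CommandLine.h"

    // Hypothetical knob for the mad rounding suffix (e.g. "rn", "rz").
    static llvm::cl::opt<std::string>
    MADRoundingMode("ptx-mad-rounding", llvm::cl::init("rn"),
                    llvm::cl::desc("Rounding suffix used when emitting mad"));

The multiclass instantiations below would then assemble the opcode string from this value instead of hard-coding mad.rn.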
+defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>, Requires<[SupportsSM13]>; +defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>, Requires<[DoesNotSupportSM13]>; + +///===- Floating-Point Intrinsic Instructions -----------------------------===// + +def FSQRT32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a), + "sqrt.rn.f32\t$d, $a", + [(set RRegf32:$d, (fsqrt RRegf32:$a))]>; + +def FSQRT64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a), + "sqrt.rn.f64\t$d, $a", + [(set RRegf64:$d, (fsqrt RRegf64:$a))]>; + +def FSIN32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a), + "sin.approx.f32\t$d, $a", + [(set RRegf32:$d, (fsin RRegf32:$a))]>; + +def FSIN64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a), + "sin.approx.f64\t$d, $a", + [(set RRegf64:$d, (fsin RRegf64:$a))]>; + +def FCOS32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a), + "cos.approx.f32\t$d, $a", + [(set RRegf32:$d, (fcos RRegf32:$a))]>; + +def FCOS64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a), + "cos.approx.f64\t$d, $a", + [(set RRegf64:$d, (fcos RRegf64:$a))]>; + + +///===- Comparison and Selection Instructions -----------------------------===// + +// Compare u16 + +defm SETPEQu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETEQ, "eq">; +defm SETPNEu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETNE, "ne">; +defm SETPLTu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETULT, "lt">; +defm SETPLEu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETULE, "le">; +defm SETPGTu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETUGT, "gt">; +defm SETPGEu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETUGE, "ge">; + +// Compare u32 + +defm SETPEQu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETEQ, "eq">; +defm SETPNEu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETNE, "ne">; +defm SETPLTu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETULT, "lt">; +defm SETPLEu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETULE, "le">; +defm SETPGTu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETUGT, "gt">; +defm SETPGEu32 : PTX_SETP_I<RRegu32, "u32", i32imm, SETUGE, "ge">; + +// Compare u64 + +defm SETPEQu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETEQ, "eq">; +defm SETPNEu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETNE, "ne">; +defm SETPLTu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETULT, "lt">; +defm SETPLEu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETULE, "le">; +defm SETPGTu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETUGT, "gt">; +defm SETPGEu64 : PTX_SETP_I<RRegu64, "u64", i64imm, SETUGE, "ge">; + +// Compare f32 + +defm SETPEQf32 : PTX_SETP_FP<RRegf32, "f32", SETUEQ, SETOEQ, "eq">; +defm SETPNEf32 : PTX_SETP_FP<RRegf32, "f32", SETUNE, SETONE, "ne">; +defm SETPLTf32 : PTX_SETP_FP<RRegf32, "f32", SETULT, SETOLT, "lt">; +defm SETPLEf32 : PTX_SETP_FP<RRegf32, "f32", SETULE, SETOLE, "le">; +defm SETPGTf32 : PTX_SETP_FP<RRegf32, "f32", SETUGT, SETOGT, "gt">; +defm SETPGEf32 : PTX_SETP_FP<RRegf32, "f32", SETUGE, SETOGE, "ge">; + +// Compare f64 + +defm SETPEQf64 : PTX_SETP_FP<RRegf64, "f64", SETUEQ, SETOEQ, "eq">; +defm SETPNEf64 : PTX_SETP_FP<RRegf64, "f64", SETUNE, SETONE, "ne">; +defm SETPLTf64 : PTX_SETP_FP<RRegf64, "f64", SETULT, SETOLT, "lt">; +defm SETPLEf64 : PTX_SETP_FP<RRegf64, "f64", SETULE, SETOLE, "le">; +defm SETPGTf64 : PTX_SETP_FP<RRegf64, "f64", SETUGT, SETOGT, "gt">; +defm SETPGEf64 : PTX_SETP_FP<RRegf64, "f64", SETUGE, SETOGE, "ge">; + +// .selp + +defm PTX_SELPu16 : PTX_SELP<RRegu16, "u16">; +defm PTX_SELPu32 : PTX_SELP<RRegu32, "u32">; +defm PTX_SELPu64 : PTX_SELP<RRegu64, "u64">; +defm PTX_SELPf32 : PTX_SELP<RRegf32, "f32">; +defm PTX_SELPf64 : PTX_SELP<RRegf64, "f64">; ///===- Logic and Shift Instructions 
--------------------------------------===// -defm SHL : INT3ntnc<"shl.b32", PTXshl>; -defm SRL : INT3ntnc<"shr.u32", PTXsrl>; -defm SRA : INT3ntnc<"shr.s32", PTXsra>; +defm SHL : INT3ntnc<"shl.b", PTXshl>; +defm SRL : INT3ntnc<"shr.u", PTXsrl>; +defm SRA : INT3ntnc<"shr.s", PTXsra>; + +defm AND : PTX_LOGIC<"and", and>; +defm OR : PTX_LOGIC<"or", or>; +defm XOR : PTX_LOGIC<"xor", xor>; ///===- Data Movement and Conversion Instructions -------------------------===// let neverHasSideEffects = 1 in { - // rely on isMoveInstr to separate MOVpp, MOVrr, etc. - def MOVpp + def MOVPREDrr : InstPTX<(outs Preds:$d), (ins Preds:$a), "mov.pred\t$d, $a", []>; - def MOVrr - : InstPTX<(outs RRegs32:$d), (ins RRegs32:$a), "mov.%type\t$d, $a", []>; + def MOVU16rr + : InstPTX<(outs RRegu16:$d), (ins RRegu16:$a), "mov.u16\t$d, $a", []>; + def MOVU32rr + : InstPTX<(outs RRegu32:$d), (ins RRegu32:$a), "mov.u32\t$d, $a", []>; + def MOVU64rr + : InstPTX<(outs RRegu64:$d), (ins RRegu64:$a), "mov.u64\t$d, $a", []>; + def MOVF32rr + : InstPTX<(outs RRegf32:$d), (ins RRegf32:$a), "mov.f32\t$d, $a", []>; + def MOVF64rr + : InstPTX<(outs RRegf64:$d), (ins RRegf64:$a), "mov.f64\t$d, $a", []>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - def MOVpi + def MOVPREDri : InstPTX<(outs Preds:$d), (ins i1imm:$a), "mov.pred\t$d, $a", [(set Preds:$d, imm:$a)]>; - def MOVri - : InstPTX<(outs RRegs32:$d), (ins i32imm:$a), "mov.s32\t$d, $a", - [(set RRegs32:$d, imm:$a)]>; + def MOVU16ri + : InstPTX<(outs RRegu16:$d), (ins i16imm:$a), "mov.u16\t$d, $a", + [(set RRegu16:$d, imm:$a)]>; + def MOVU32ri + : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", + [(set RRegu32:$d, imm:$a)]>; + def MOVU64ri + : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", + [(set RRegu64:$d, imm:$a)]>; + def MOVF32ri + : InstPTX<(outs RRegf32:$d), (ins f32imm:$a), "mov.f32\t$d, $a", + [(set RRegf32:$d, fpimm:$a)]>; + def MOVF64ri + : InstPTX<(outs RRegf64:$d), (ins f64imm:$a), "mov.f64\t$d, $a", + [(set RRegf64:$d, fpimm:$a)]>; } -defm LDg : PTX_LD<"ld.global", RRegs32, load_global>; -defm LDc : PTX_LD<"ld.const", RRegs32, load_constant>; -defm LDl : PTX_LD<"ld.local", RRegs32, load_local>; -defm LDp : PTX_LD<"ld.param", RRegs32, load_parameter>; -defm LDs : PTX_LD<"ld.shared", RRegs32, load_shared>; +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { + def MOVaddr32 + : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", + [(set RRegu32:$d, (PTXcopyaddress tglobaladdr:$a))]>; + def MOVaddr64 + : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", + [(set RRegu64:$d, (PTXcopyaddress tglobaladdr:$a))]>; +} + +// Loads +defm LDg : PTX_LD_ALL<"ld.global", load_global>; +defm LDc : PTX_LD_ALL<"ld.const", load_constant>; +defm LDl : PTX_LD_ALL<"ld.local", load_local>; +defm LDs : PTX_LD_ALL<"ld.shared", load_shared>; + +// This is a special instruction that is manually inserted for kernel parameters +def LDpiU16 : InstPTX<(outs RRegu16:$d), (ins MEMpi:$a), + "ld.param.u16\t$d, [$a]", []>; +def LDpiU32 : InstPTX<(outs RRegu32:$d), (ins MEMpi:$a), + "ld.param.u32\t$d, [$a]", []>; +def LDpiU64 : InstPTX<(outs RRegu64:$d), (ins MEMpi:$a), + "ld.param.u64\t$d, [$a]", []>; +def LDpiF32 : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a), + "ld.param.f32\t$d, [$a]", []>; +def LDpiF64 : InstPTX<(outs RRegf64:$d), (ins MEMpi:$a), + "ld.param.f64\t$d, [$a]", []>; + +// Stores +defm STg : PTX_ST_ALL<"st.global", store_global>; +defm STl : PTX_ST_ALL<"st.local", store_local>; +defm STs : PTX_ST_ALL<"st.shared", 
store_shared>; + +// defm STp : PTX_ST_ALL<"st.param", store_parameter>; +// defm LDp : PTX_LD_ALL<"ld.param", load_parameter>; +// TODO: Do something with st.param if/when it is needed. + +// Conversion to pred + +def CVT_pred_u16 + : InstPTX<(outs Preds:$d), (ins RRegu16:$a), "cvt.pred.u16\t$d, $a", + [(set Preds:$d, (trunc RRegu16:$a))]>; -def LDpi : InstPTX<(outs RRegs32:$d), (ins MEMpi:$a), - "ld.param.%type\t$d, [$a]", []>; +def CVT_pred_u32 + : InstPTX<(outs Preds:$d), (ins RRegu32:$a), "cvt.pred.u32\t$d, $a", + [(set Preds:$d, (trunc RRegu32:$a))]>; -defm STg : PTX_ST<"st.global", RRegs32, store_global>; -defm STl : PTX_ST<"st.local", RRegs32, store_local>; -// Store to parameter state space requires PTX 2.0 or higher? -// defm STp : PTX_ST<"st.param", RRegs32, store_parameter>; -defm STs : PTX_ST<"st.shared", RRegs32, store_shared>; +def CVT_pred_u64 + : InstPTX<(outs Preds:$d), (ins RRegu64:$a), "cvt.pred.u64\t$d, $a", + [(set Preds:$d, (trunc RRegu64:$a))]>; + +def CVT_pred_f32 + : InstPTX<(outs Preds:$d), (ins RRegf32:$a), "cvt.rni.pred.f32\t$d, $a", + [(set Preds:$d, (fp_to_uint RRegf32:$a))]>; + +def CVT_pred_f64 + : InstPTX<(outs Preds:$d), (ins RRegf64:$a), "cvt.rni.pred.f64\t$d, $a", + [(set Preds:$d, (fp_to_uint RRegf64:$a))]>; + +// Conversion to u16 + +def CVT_u16_pred + : InstPTX<(outs RRegu16:$d), (ins Preds:$a), "cvt.u16.pred\t$d, $a", + [(set RRegu16:$d, (zext Preds:$a))]>; + +def CVT_u16_u32 + : InstPTX<(outs RRegu16:$d), (ins RRegu32:$a), "cvt.u16.u32\t$d, $a", + [(set RRegu16:$d, (trunc RRegu32:$a))]>; + +def CVT_u16_u64 + : InstPTX<(outs RRegu16:$d), (ins RRegu64:$a), "cvt.u16.u64\t$d, $a", + [(set RRegu16:$d, (trunc RRegu64:$a))]>; + +def CVT_u16_f32 + : InstPTX<(outs RRegu16:$d), (ins RRegf32:$a), "cvt.rni.u16.f32\t$d, $a", + [(set RRegu16:$d, (fp_to_uint RRegf32:$a))]>; + +def CVT_u16_f64 + : InstPTX<(outs RRegu16:$d), (ins RRegf64:$a), "cvt.rni.u16.f64\t$d, $a", + [(set RRegu16:$d, (fp_to_uint RRegf64:$a))]>; + +// Conversion to u32 + +def CVT_u32_pred + : InstPTX<(outs RRegu32:$d), (ins Preds:$a), "cvt.u32.pred\t$d, $a", + [(set RRegu32:$d, (zext Preds:$a))]>; + +def CVT_u32_u16 + : InstPTX<(outs RRegu32:$d), (ins RRegu16:$a), "cvt.u32.u16\t$d, $a", + [(set RRegu32:$d, (zext RRegu16:$a))]>; + +def CVT_u32_u64 + : InstPTX<(outs RRegu32:$d), (ins RRegu64:$a), "cvt.u32.u64\t$d, $a", + [(set RRegu32:$d, (trunc RRegu64:$a))]>; + +def CVT_u32_f32 + : InstPTX<(outs RRegu32:$d), (ins RRegf32:$a), "cvt.rni.u32.f32\t$d, $a", + [(set RRegu32:$d, (fp_to_uint RRegf32:$a))]>; + +def CVT_u32_f64 + : InstPTX<(outs RRegu32:$d), (ins RRegf64:$a), "cvt.rni.u32.f64\t$d, $a", + [(set RRegu32:$d, (fp_to_uint RRegf64:$a))]>; + +// Conversion to u64 + +def CVT_u64_pred + : InstPTX<(outs RRegu64:$d), (ins Preds:$a), "cvt.u64.pred\t$d, $a", + [(set RRegu64:$d, (zext Preds:$a))]>; + +def CVT_u64_u16 + : InstPTX<(outs RRegu64:$d), (ins RRegu16:$a), "cvt.u64.u16\t$d, $a", + [(set RRegu64:$d, (zext RRegu16:$a))]>; + +def CVT_u64_u32 + : InstPTX<(outs RRegu64:$d), (ins RRegu32:$a), "cvt.u64.u32\t$d, $a", + [(set RRegu64:$d, (zext RRegu32:$a))]>; + +def CVT_u64_f32 + : InstPTX<(outs RRegu64:$d), (ins RRegf32:$a), "cvt.rni.u64.f32\t$d, $a", + [(set RRegu64:$d, (fp_to_uint RRegf32:$a))]>; + +def CVT_u64_f64 + : InstPTX<(outs RRegu64:$d), (ins RRegf64:$a), "cvt.rni.u64.f64\t$d, $a", + [(set RRegu64:$d, (fp_to_uint RRegf64:$a))]>; + +// Conversion to f32 + +def CVT_f32_pred + : InstPTX<(outs RRegf32:$d), (ins Preds:$a), "cvt.rn.f32.pred\t$d, $a", + [(set RRegf32:$d, (uint_to_fp Preds:$a))]>; 
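Note the convention in these cvt patterns: integer-to-float conversions carry the .rn rounding modifier, while the float-to-integer forms earlier in this hunk use .rni. As a quick end-to-end illustration, a plain C-level cast exercises the u32 variant defined just below (the function is illustrative only):

    // Compiled through this backend, the uint_to_fp node produced by the
    // cast should select CVT_f32_u32, i.e. a cvt.rn.f32.u32 instruction.
    float to_float(unsigned x) { return (float)x; }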
+ +def CVT_f32_u16 + : InstPTX<(outs RRegf32:$d), (ins RRegu16:$a), "cvt.rn.f32.u16\t$d, $a", + [(set RRegf32:$d, (uint_to_fp RRegu16:$a))]>; + +def CVT_f32_u32 + : InstPTX<(outs RRegf32:$d), (ins RRegu32:$a), "cvt.rn.f32.u32\t$d, $a", + [(set RRegf32:$d, (uint_to_fp RRegu32:$a))]>; + +def CVT_f32_u64 + : InstPTX<(outs RRegf32:$d), (ins RRegu64:$a), "cvt.rn.f32.u64\t$d, $a", + [(set RRegf32:$d, (uint_to_fp RRegu64:$a))]>; + +def CVT_f32_f64 + : InstPTX<(outs RRegf32:$d), (ins RRegf64:$a), "cvt.rn.f32.f64\t$d, $a", + [(set RRegf32:$d, (fround RRegf64:$a))]>; + +// Conversion to f64 + +def CVT_f64_pred + : InstPTX<(outs RRegf64:$d), (ins Preds:$a), "cvt.rn.f64.pred\t$d, $a", + [(set RRegf64:$d, (uint_to_fp Preds:$a))]>; + +def CVT_f64_u16 + : InstPTX<(outs RRegf64:$d), (ins RRegu16:$a), "cvt.rn.f64.u16\t$d, $a", + [(set RRegf64:$d, (uint_to_fp RRegu16:$a))]>; + +def CVT_f64_u32 + : InstPTX<(outs RRegf64:$d), (ins RRegu32:$a), "cvt.rn.f64.u32\t$d, $a", + [(set RRegf64:$d, (uint_to_fp RRegu32:$a))]>; + +def CVT_f64_u64 + : InstPTX<(outs RRegf64:$d), (ins RRegu64:$a), "cvt.rn.f64.u64\t$d, $a", + [(set RRegf64:$d, (uint_to_fp RRegu64:$a))]>; + +def CVT_f64_f32 + : InstPTX<(outs RRegf64:$d), (ins RRegf32:$a), "cvt.f64.f32\t$d, $a", + [(set RRegf64:$d, (fextend RRegf32:$a))]>; ///===- Control Flow Instructions -----------------------------------------===// +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { + def BRAd + : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>; +} + +let isBranch = 1, isTerminator = 1 in { + // FIXME: The pattern part is blank because I cannot (or do not yet know + // how to) use the first operand of PredicateOperand (a Preds register) here + def BRAdp + : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", + [/*(brcond pred:$_p, bb:$d)*/]>; +} + let isReturn = 1, isTerminator = 1, isBarrier = 1 in { def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>; def RET : InstPTX<(outs), (ins), "ret", [(PTXret)]>; } + +///===- Intrinsic Instructions --------------------------------------------===// + +include "PTXIntrinsicInstrInfo.td" diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td new file mode 100644 index 0000000..320934a --- /dev/null +++ b/lib/Target/PTX/PTXIntrinsicInstrInfo.td @@ -0,0 +1,84 @@ +//===- PTXIntrinsicInstrInfo.td - Defines PTX intrinsics ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the PTX-specific intrinsic instructions. 
+// +//===----------------------------------------------------------------------===// + +// PTX Special Purpose Register Accessor Intrinsics + +class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop> + : InstPTX<(outs RRegu64:$d), (ins), + !strconcat("mov.u64\t$d, %", regname), + [(set RRegu64:$d, (intop))]>; + +class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop> + : InstPTX<(outs RRegu32:$d), (ins), + !strconcat("mov.u32\t$d, %", regname), + [(set RRegu32:$d, (intop))]>; + +// TODO Add read vector-version of special registers + +//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid", int_ptx_read_tid_r64>; +def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x", int_ptx_read_tid_x>; +def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y", int_ptx_read_tid_y>; +def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z", int_ptx_read_tid_z>; +def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w", int_ptx_read_tid_w>; + +//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid", int_ptx_read_ntid_r64>; +def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x", int_ptx_read_ntid_x>; +def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y", int_ptx_read_ntid_y>; +def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z", int_ptx_read_ntid_z>; +def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w", int_ptx_read_ntid_w>; + +def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid", int_ptx_read_laneid>; +def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid", int_ptx_read_warpid>; +def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", int_ptx_read_nwarpid>; + +//def PTX_READ_CTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>; +def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x", int_ptx_read_ctaid_x>; +def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y", int_ptx_read_ctaid_y>; +def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z", int_ptx_read_ctaid_z>; +def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w", int_ptx_read_ctaid_w>; + +//def PTX_READ_NCTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>; +def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x", int_ptx_read_nctaid_x>; +def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y", int_ptx_read_nctaid_y>; +def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z", int_ptx_read_nctaid_z>; +def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w", int_ptx_read_nctaid_w>; + +def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid", int_ptx_read_smid>; +def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid", int_ptx_read_nsmid>; +def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid", int_ptx_read_gridid>; + +def PTX_READ_LANEMASK_EQ + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>; +def PTX_READ_LANEMASK_LE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>; +def PTX_READ_LANEMASK_LT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>; +def PTX_READ_LANEMASK_GE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>; +def PTX_READ_LANEMASK_GT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>; + +def PTX_READ_CLOCK + : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>; +def PTX_READ_CLOCK64 + : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>; + +def PTX_READ_PM0 : 
PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>; +def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>; +def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>; +def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>; + +// PTX Parallel Synchronization and Communication Intrinsics + +def PTX_BAR_SYNC : InstPTX<(outs), (ins i32imm:$i), "bar.sync\t$i", + [(int_ptx_bar_sync imm:$i)]>; diff --git a/lib/Target/PTX/PTXMCAsmStreamer.cpp b/lib/Target/PTX/PTXMCAsmStreamer.cpp index 0886ba8..1574670 100644 --- a/lib/Target/PTX/PTXMCAsmStreamer.cpp +++ b/lib/Target/PTX/PTXMCAsmStreamer.cpp @@ -143,9 +143,9 @@ public: virtual void EmitBytes(StringRef Data, unsigned AddrSpace); virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace); - virtual void EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace = 0); - virtual void EmitSLEB128Value(const MCExpr *Value, unsigned AddrSpace = 0); + unsigned AddrSpace); + virtual void EmitULEB128Value(const MCExpr *Value); + virtual void EmitSLEB128Value(const MCExpr *Value); virtual void EmitGPRel32Value(const MCExpr *Value); @@ -233,7 +233,7 @@ void PTXMCAsmStreamer::ChangeSection(const MCSection *Section) { void PTXMCAsmStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); - assert(getCurrentSection() && "Cannot emit before setting section!"); + //assert(getCurrentSection() && "Cannot emit before setting section!"); OS << *Symbol << MAI.getLabelSuffix(); EmitEOL(); @@ -352,9 +352,8 @@ void PTXMCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { } void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - bool isPCRel, unsigned AddrSpace) { + unsigned AddrSpace) { assert(getCurrentSection() && "Cannot emit contents before setting section!"); - assert(!isPCRel && "Cannot emit pc relative relocations!"); const char *Directive = 0; switch (Size) { default: break; @@ -383,15 +382,13 @@ void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, EmitEOL(); } -void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value, - unsigned AddrSpace) { +void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value) { assert(MAI.hasLEB128() && "Cannot print a .uleb"); OS << ".uleb128 " << *Value; EmitEOL(); } -void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value, - unsigned AddrSpace) { +void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) { assert(MAI.hasLEB128() && "Cannot print a .sleb"); OS << ".sleb128 " << *Value; EmitEOL(); @@ -423,7 +420,8 @@ void PTXMCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue, MCStreamer::EmitFill(NumBytes, FillValue, AddrSpace); } -void PTXMCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, +void PTXMCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, + int64_t Value, unsigned ValueSize, unsigned MaxBytesToEmit) { // Some assemblers don't support non-power of two alignments, so we always @@ -532,7 +530,7 @@ void PTXMCAsmStreamer::Finish() {} namespace llvm { MCStreamer *createPTXAsmStreamer(MCContext &Context, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useLoc, + bool isVerboseAsm, bool useLoc, bool useCFI, MCInstPrinter *IP, MCCodeEmitter *CE, TargetAsmBackend *TAB, bool ShowInst) { diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp index b37c740..c5e1910 100644 --- 
a/lib/Target/PTX/PTXMFInfoExtract.cpp +++ b/lib/Target/PTX/PTXMFInfoExtract.cpp @@ -79,12 +79,12 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) { DEBUG(for (PTXMachineFunctionInfo::reg_iterator i = MFI->argRegBegin(), e = MFI->argRegEnd(); - i != e; ++i) + i != e; ++i) dbgs() << "Arg Reg: " << *i << "\n";); DEBUG(for (PTXMachineFunctionInfo::reg_iterator i = MFI->localVarRegBegin(), e = MFI->localVarRegEnd(); - i != e; ++i) + i != e; ++i) dbgs() << "Local Var Reg: " << *i << "\n";); return false; diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h index 56d044b..81df1c2 100644 --- a/lib/Target/PTX/PTXMachineFunctionInfo.h +++ b/lib/Target/PTX/PTXMachineFunctionInfo.h @@ -42,36 +42,37 @@ public: void setRetReg(unsigned reg) { reg_ret = reg; } void doneAddArg(void) { - std::sort(reg_arg.begin(), reg_arg.end()); _isDoneAddArg = true; } - void doneAddLocalVar(void) { - std::sort(reg_local_var.begin(), reg_local_var.end()); - } + void doneAddLocalVar(void) {} bool isDoneAddArg(void) { return _isDoneAddArg; } bool isKernel() const { return is_kernel; } - typedef std::vector<unsigned>::const_iterator reg_iterator; + typedef std::vector<unsigned>::const_iterator reg_iterator; + typedef std::vector<unsigned>::const_reverse_iterator reg_reverse_iterator; - bool argRegEmpty() const { return reg_arg.empty(); } - int getNumArg() const { return reg_arg.size(); } + bool argRegEmpty() const { return reg_arg.empty(); } + int getNumArg() const { return reg_arg.size(); } reg_iterator argRegBegin() const { return reg_arg.begin(); } reg_iterator argRegEnd() const { return reg_arg.end(); } + reg_reverse_iterator argRegReverseBegin() const { return reg_arg.rbegin(); } + reg_reverse_iterator argRegReverseEnd() const { return reg_arg.rend(); } - bool localVarRegEmpty() const { return reg_local_var.empty(); } + bool localVarRegEmpty() const { return reg_local_var.empty(); } reg_iterator localVarRegBegin() const { return reg_local_var.begin(); } reg_iterator localVarRegEnd() const { return reg_local_var.end(); } unsigned retReg() const { return reg_ret; } bool isArgReg(unsigned reg) const { - return std::binary_search(reg_arg.begin(), reg_arg.end(), reg); + return std::find(reg_arg.begin(), reg_arg.end(), reg) != reg_arg.end(); } bool isLocalVarReg(unsigned reg) const { - return std::binary_search(reg_local_var.begin(), reg_local_var.end(), reg); + return std::find(reg_local_var.begin(), reg_local_var.end(), reg) + != reg_local_var.end(); } }; // class PTXMachineFunctionInfo } // namespace llvm diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td index 22e2b34..f616141 100644 --- a/lib/Target/PTX/PTXRegisterInfo.td +++ b/lib/Target/PTX/PTXRegisterInfo.td @@ -19,6 +19,8 @@ class PTXReg<string n> : Register<n> { // Registers //===----------------------------------------------------------------------===// +///===- Predicate Registers -----------------------------------------------===// + def P0 : PTXReg<"p0">; def P1 : PTXReg<"p1">; def P2 : PTXReg<"p2">; @@ -51,6 +53,108 @@ def P28 : PTXReg<"p28">; def P29 : PTXReg<"p29">; def P30 : PTXReg<"p30">; def P31 : PTXReg<"p31">; +def P32 : PTXReg<"p32">; +def P33 : PTXReg<"p33">; +def P34 : PTXReg<"p34">; +def P35 : PTXReg<"p35">; +def P36 : PTXReg<"p36">; +def P37 : PTXReg<"p37">; +def P38 : PTXReg<"p38">; +def P39 : PTXReg<"p39">; +def P40 : PTXReg<"p40">; +def P41 : PTXReg<"p41">; +def P42 : PTXReg<"p42">; +def P43 : PTXReg<"p43">; +def P44 : PTXReg<"p44">; +def P45 : 
PTXReg<"p45">; +def P46 : PTXReg<"p46">; +def P47 : PTXReg<"p47">; +def P48 : PTXReg<"p48">; +def P49 : PTXReg<"p49">; +def P50 : PTXReg<"p50">; +def P51 : PTXReg<"p51">; +def P52 : PTXReg<"p52">; +def P53 : PTXReg<"p53">; +def P54 : PTXReg<"p54">; +def P55 : PTXReg<"p55">; +def P56 : PTXReg<"p56">; +def P57 : PTXReg<"p57">; +def P58 : PTXReg<"p58">; +def P59 : PTXReg<"p59">; +def P60 : PTXReg<"p60">; +def P61 : PTXReg<"p61">; +def P62 : PTXReg<"p62">; +def P63 : PTXReg<"p63">; + +///===- 16-bit Integer Registers ------------------------------------------===// + +def RH0 : PTXReg<"rh0">; +def RH1 : PTXReg<"rh1">; +def RH2 : PTXReg<"rh2">; +def RH3 : PTXReg<"rh3">; +def RH4 : PTXReg<"rh4">; +def RH5 : PTXReg<"rh5">; +def RH6 : PTXReg<"rh6">; +def RH7 : PTXReg<"rh7">; +def RH8 : PTXReg<"rh8">; +def RH9 : PTXReg<"rh9">; +def RH10 : PTXReg<"rh10">; +def RH11 : PTXReg<"rh11">; +def RH12 : PTXReg<"rh12">; +def RH13 : PTXReg<"rh13">; +def RH14 : PTXReg<"rh14">; +def RH15 : PTXReg<"rh15">; +def RH16 : PTXReg<"rh16">; +def RH17 : PTXReg<"rh17">; +def RH18 : PTXReg<"rh18">; +def RH19 : PTXReg<"rh19">; +def RH20 : PTXReg<"rh20">; +def RH21 : PTXReg<"rh21">; +def RH22 : PTXReg<"rh22">; +def RH23 : PTXReg<"rh23">; +def RH24 : PTXReg<"rh24">; +def RH25 : PTXReg<"rh25">; +def RH26 : PTXReg<"rh26">; +def RH27 : PTXReg<"rh27">; +def RH28 : PTXReg<"rh28">; +def RH29 : PTXReg<"rh29">; +def RH30 : PTXReg<"rh30">; +def RH31 : PTXReg<"rh31">; +def RH32 : PTXReg<"rh32">; +def RH33 : PTXReg<"rh33">; +def RH34 : PTXReg<"rh34">; +def RH35 : PTXReg<"rh35">; +def RH36 : PTXReg<"rh36">; +def RH37 : PTXReg<"rh37">; +def RH38 : PTXReg<"rh38">; +def RH39 : PTXReg<"rh39">; +def RH40 : PTXReg<"rh40">; +def RH41 : PTXReg<"rh41">; +def RH42 : PTXReg<"rh42">; +def RH43 : PTXReg<"rh43">; +def RH44 : PTXReg<"rh44">; +def RH45 : PTXReg<"rh45">; +def RH46 : PTXReg<"rh46">; +def RH47 : PTXReg<"rh47">; +def RH48 : PTXReg<"rh48">; +def RH49 : PTXReg<"rh49">; +def RH50 : PTXReg<"rh50">; +def RH51 : PTXReg<"rh51">; +def RH52 : PTXReg<"rh52">; +def RH53 : PTXReg<"rh53">; +def RH54 : PTXReg<"rh54">; +def RH55 : PTXReg<"rh55">; +def RH56 : PTXReg<"rh56">; +def RH57 : PTXReg<"rh57">; +def RH58 : PTXReg<"rh58">; +def RH59 : PTXReg<"rh59">; +def RH60 : PTXReg<"rh60">; +def RH61 : PTXReg<"rh61">; +def RH62 : PTXReg<"rh62">; +def RH63 : PTXReg<"rh63">; + + +///===- 32-bit Integer Registers ------------------------------------------===// def R0 : PTXReg<"r0">; def R1 : PTXReg<"r1">; @@ -84,6 +188,243 @@ def R28 : PTXReg<"r28">; def R29 : PTXReg<"r29">; def R30 : PTXReg<"r30">; def R31 : PTXReg<"r31">; +def R32 : PTXReg<"r32">; +def R33 : PTXReg<"r33">; +def R34 : PTXReg<"r34">; +def R35 : PTXReg<"r35">; +def R36 : PTXReg<"r36">; +def R37 : PTXReg<"r37">; +def R38 : PTXReg<"r38">; +def R39 : PTXReg<"r39">; +def R40 : PTXReg<"r40">; +def R41 : PTXReg<"r41">; +def R42 : PTXReg<"r42">; +def R43 : PTXReg<"r43">; +def R44 : PTXReg<"r44">; +def R45 : PTXReg<"r45">; +def R46 : PTXReg<"r46">; +def R47 : PTXReg<"r47">; +def R48 : PTXReg<"r48">; +def R49 : PTXReg<"r49">; +def R50 : PTXReg<"r50">; +def R51 : PTXReg<"r51">; +def R52 : PTXReg<"r52">; +def R53 : PTXReg<"r53">; +def R54 : PTXReg<"r54">; +def R55 : PTXReg<"r55">; +def R56 : PTXReg<"r56">; +def R57 : PTXReg<"r57">; +def R58 : PTXReg<"r58">; +def R59 : PTXReg<"r59">; +def R60 : PTXReg<"r60">; +def R61 : PTXReg<"r61">; +def R62 : PTXReg<"r62">; +def R63 : PTXReg<"r63">; + + +///===- 64-bit Integer Registers ------------------------------------------===// + +def RD0 : PTXReg<"rd0">; +def RD1 : 
PTXReg<"rd1">; +def RD2 : PTXReg<"rd2">; +def RD3 : PTXReg<"rd3">; +def RD4 : PTXReg<"rd4">; +def RD5 : PTXReg<"rd5">; +def RD6 : PTXReg<"rd6">; +def RD7 : PTXReg<"rd7">; +def RD8 : PTXReg<"rd8">; +def RD9 : PTXReg<"rd9">; +def RD10 : PTXReg<"rd10">; +def RD11 : PTXReg<"rd11">; +def RD12 : PTXReg<"rd12">; +def RD13 : PTXReg<"rd13">; +def RD14 : PTXReg<"rd14">; +def RD15 : PTXReg<"rd15">; +def RD16 : PTXReg<"rd16">; +def RD17 : PTXReg<"rd17">; +def RD18 : PTXReg<"rd18">; +def RD19 : PTXReg<"rd19">; +def RD20 : PTXReg<"rd20">; +def RD21 : PTXReg<"rd21">; +def RD22 : PTXReg<"rd22">; +def RD23 : PTXReg<"rd23">; +def RD24 : PTXReg<"rd24">; +def RD25 : PTXReg<"rd25">; +def RD26 : PTXReg<"rd26">; +def RD27 : PTXReg<"rd27">; +def RD28 : PTXReg<"rd28">; +def RD29 : PTXReg<"rd29">; +def RD30 : PTXReg<"rd30">; +def RD31 : PTXReg<"rd31">; +def RD32 : PTXReg<"rd32">; +def RD33 : PTXReg<"rd33">; +def RD34 : PTXReg<"rd34">; +def RD35 : PTXReg<"rd35">; +def RD36 : PTXReg<"rd36">; +def RD37 : PTXReg<"rd37">; +def RD38 : PTXReg<"rd38">; +def RD39 : PTXReg<"rd39">; +def RD40 : PTXReg<"rd40">; +def RD41 : PTXReg<"rd41">; +def RD42 : PTXReg<"rd42">; +def RD43 : PTXReg<"rd43">; +def RD44 : PTXReg<"rd44">; +def RD45 : PTXReg<"rd45">; +def RD46 : PTXReg<"rd46">; +def RD47 : PTXReg<"rd47">; +def RD48 : PTXReg<"rd48">; +def RD49 : PTXReg<"rd49">; +def RD50 : PTXReg<"rd50">; +def RD51 : PTXReg<"rd51">; +def RD52 : PTXReg<"rd52">; +def RD53 : PTXReg<"rd53">; +def RD54 : PTXReg<"rd54">; +def RD55 : PTXReg<"rd55">; +def RD56 : PTXReg<"rd56">; +def RD57 : PTXReg<"rd57">; +def RD58 : PTXReg<"rd58">; +def RD59 : PTXReg<"rd59">; +def RD60 : PTXReg<"rd60">; +def RD61 : PTXReg<"rd61">; +def RD62 : PTXReg<"rd62">; +def RD63 : PTXReg<"rd63">; + + +///===- 32-bit Floating-Point Registers -----------------------------------===// + +def F0 : PTXReg<"f0">; +def F1 : PTXReg<"f1">; +def F2 : PTXReg<"f2">; +def F3 : PTXReg<"f3">; +def F4 : PTXReg<"f4">; +def F5 : PTXReg<"f5">; +def F6 : PTXReg<"f6">; +def F7 : PTXReg<"f7">; +def F8 : PTXReg<"f8">; +def F9 : PTXReg<"f9">; +def F10 : PTXReg<"f10">; +def F11 : PTXReg<"f11">; +def F12 : PTXReg<"f12">; +def F13 : PTXReg<"f13">; +def F14 : PTXReg<"f14">; +def F15 : PTXReg<"f15">; +def F16 : PTXReg<"f16">; +def F17 : PTXReg<"f17">; +def F18 : PTXReg<"f18">; +def F19 : PTXReg<"f19">; +def F20 : PTXReg<"f20">; +def F21 : PTXReg<"f21">; +def F22 : PTXReg<"f22">; +def F23 : PTXReg<"f23">; +def F24 : PTXReg<"f24">; +def F25 : PTXReg<"f25">; +def F26 : PTXReg<"f26">; +def F27 : PTXReg<"f27">; +def F28 : PTXReg<"f28">; +def F29 : PTXReg<"f29">; +def F30 : PTXReg<"f30">; +def F31 : PTXReg<"f31">; +def F32 : PTXReg<"f32">; +def F33 : PTXReg<"f33">; +def F34 : PTXReg<"f34">; +def F35 : PTXReg<"f35">; +def F36 : PTXReg<"f36">; +def F37 : PTXReg<"f37">; +def F38 : PTXReg<"f38">; +def F39 : PTXReg<"f39">; +def F40 : PTXReg<"f40">; +def F41 : PTXReg<"f41">; +def F42 : PTXReg<"f42">; +def F43 : PTXReg<"f43">; +def F44 : PTXReg<"f44">; +def F45 : PTXReg<"f45">; +def F46 : PTXReg<"f46">; +def F47 : PTXReg<"f47">; +def F48 : PTXReg<"f48">; +def F49 : PTXReg<"f49">; +def F50 : PTXReg<"f50">; +def F51 : PTXReg<"f51">; +def F52 : PTXReg<"f52">; +def F53 : PTXReg<"f53">; +def F54 : PTXReg<"f54">; +def F55 : PTXReg<"f55">; +def F56 : PTXReg<"f56">; +def F57 : PTXReg<"f57">; +def F58 : PTXReg<"f58">; +def F59 : PTXReg<"f59">; +def F60 : PTXReg<"f60">; +def F61 : PTXReg<"f61">; +def F62 : PTXReg<"f62">; +def F63 : PTXReg<"f63">; + + +///===- 64-bit Floating-Point Registers -----------------------------------===// + 
+def FD0 : PTXReg<"fd0">; +def FD1 : PTXReg<"fd1">; +def FD2 : PTXReg<"fd2">; +def FD3 : PTXReg<"fd3">; +def FD4 : PTXReg<"fd4">; +def FD5 : PTXReg<"fd5">; +def FD6 : PTXReg<"fd6">; +def FD7 : PTXReg<"fd7">; +def FD8 : PTXReg<"fd8">; +def FD9 : PTXReg<"fd9">; +def FD10 : PTXReg<"fd10">; +def FD11 : PTXReg<"fd11">; +def FD12 : PTXReg<"fd12">; +def FD13 : PTXReg<"fd13">; +def FD14 : PTXReg<"fd14">; +def FD15 : PTXReg<"fd15">; +def FD16 : PTXReg<"fd16">; +def FD17 : PTXReg<"fd17">; +def FD18 : PTXReg<"fd18">; +def FD19 : PTXReg<"fd19">; +def FD20 : PTXReg<"fd20">; +def FD21 : PTXReg<"fd21">; +def FD22 : PTXReg<"fd22">; +def FD23 : PTXReg<"fd23">; +def FD24 : PTXReg<"fd24">; +def FD25 : PTXReg<"fd25">; +def FD26 : PTXReg<"fd26">; +def FD27 : PTXReg<"fd27">; +def FD28 : PTXReg<"fd28">; +def FD29 : PTXReg<"fd29">; +def FD30 : PTXReg<"fd30">; +def FD31 : PTXReg<"fd31">; +def FD32 : PTXReg<"fd32">; +def FD33 : PTXReg<"fd33">; +def FD34 : PTXReg<"fd34">; +def FD35 : PTXReg<"fd35">; +def FD36 : PTXReg<"fd36">; +def FD37 : PTXReg<"fd37">; +def FD38 : PTXReg<"fd38">; +def FD39 : PTXReg<"fd39">; +def FD40 : PTXReg<"fd40">; +def FD41 : PTXReg<"fd41">; +def FD42 : PTXReg<"fd42">; +def FD43 : PTXReg<"fd43">; +def FD44 : PTXReg<"fd44">; +def FD45 : PTXReg<"fd45">; +def FD46 : PTXReg<"fd46">; +def FD47 : PTXReg<"fd47">; +def FD48 : PTXReg<"fd48">; +def FD49 : PTXReg<"fd49">; +def FD50 : PTXReg<"fd50">; +def FD51 : PTXReg<"fd51">; +def FD52 : PTXReg<"fd52">; +def FD53 : PTXReg<"fd53">; +def FD54 : PTXReg<"fd54">; +def FD55 : PTXReg<"fd55">; +def FD56 : PTXReg<"fd56">; +def FD57 : PTXReg<"fd57">; +def FD58 : PTXReg<"fd58">; +def FD59 : PTXReg<"fd59">; +def FD60 : PTXReg<"fd60">; +def FD61 : PTXReg<"fd61">; +def FD62 : PTXReg<"fd62">; +def FD63 : PTXReg<"fd63">; + //===----------------------------------------------------------------------===// // Register classes @@ -93,10 +434,58 @@ def Preds : RegisterClass<"PTX", [i1], 8, [P0, P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P20, P21, P22, P23, - P24, P25, P26, P27, P28, P29, P30, P31]>; + P24, P25, P26, P27, P28, P29, P30, P31, + P32, P33, P34, P35, P36, P37, P38, P39, + P40, P41, P42, P43, P44, P45, P46, P47, + P48, P49, P50, P51, P52, P53, P54, P55, + P56, P57, P58, P59, P60, P61, P62, P63]>; -def RRegs32 : RegisterClass<"PTX", [i32], 32, +def RRegu16 : RegisterClass<"PTX", [i16], 16, + [RH0, RH1, RH2, RH3, RH4, RH5, RH6, RH7, + RH8, RH9, RH10, RH11, RH12, RH13, RH14, RH15, + RH16, RH17, RH18, RH19, RH20, RH21, RH22, RH23, + RH24, RH25, RH26, RH27, RH28, RH29, RH30, RH31, + RH32, RH33, RH34, RH35, RH36, RH37, RH38, RH39, + RH40, RH41, RH42, RH43, RH44, RH45, RH46, RH47, + RH48, RH49, RH50, RH51, RH52, RH53, RH54, RH55, + RH56, RH57, RH58, RH59, RH60, RH61, RH62, RH63]>; + +def RRegu32 : RegisterClass<"PTX", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, - R24, R25, R26, R27, R28, R29, R30, R31]>; + R24, R25, R26, R27, R28, R29, R30, R31, + R32, R33, R34, R35, R36, R37, R38, R39, + R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, + R56, R57, R58, R59, R60, R61, R62, R63]>; + +def RRegu64 : RegisterClass<"PTX", [i64], 64, + [RD0, RD1, RD2, RD3, RD4, RD5, RD6, RD7, + RD8, RD9, RD10, RD11, RD12, RD13, RD14, RD15, + RD16, RD17, RD18, RD19, RD20, RD21, RD22, RD23, + RD24, RD25, RD26, RD27, RD28, RD29, RD30, RD31, + RD32, RD33, RD34, RD35, RD36, RD37, RD38, RD39, + RD40, RD41, RD42, RD43, RD44, RD45, RD46, RD47, +
RD48, RD49, RD50, RD51, RD52, RD53, RD54, RD55, + RD56, RD57, RD58, RD59, RD60, RD61, RD62, RD63]>; + +def RRegf32 : RegisterClass<"PTX", [f32], 32, + [F0, F1, F2, F3, F4, F5, F6, F7, + F8, F9, F10, F11, F12, F13, F14, F15, + F16, F17, F18, F19, F20, F21, F22, F23, + F24, F25, F26, F27, F28, F29, F30, F31, + F32, F33, F34, F35, F36, F37, F38, F39, + F40, F41, F42, F43, F44, F45, F46, F47, + F48, F49, F50, F51, F52, F53, F54, F55, + F56, F57, F58, F59, F60, F61, F62, F63]>; + +def RRegf64 : RegisterClass<"PTX", [f64], 64, + [FD0, FD1, FD2, FD3, FD4, FD5, FD6, FD7, + FD8, FD9, FD10, FD11, FD12, FD13, FD14, FD15, + FD16, FD17, FD18, FD19, FD20, FD21, FD22, FD23, + FD24, FD25, FD26, FD27, FD28, FD29, FD30, FD31, + FD32, FD33, FD34, FD35, FD36, FD37, FD38, FD39, + FD40, FD41, FD42, FD43, FD44, FD45, FD46, FD47, + FD48, FD49, FD50, FD51, FD52, FD53, FD54, FD55, + FD56, FD57, FD58, FD59, FD60, FD61, FD62, FD63]>; diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp index 00e2c88..a224f2b 100644 --- a/lib/Target/PTX/PTXSubtarget.cpp +++ b/lib/Target/PTX/PTXSubtarget.cpp @@ -12,12 +12,36 @@ //===----------------------------------------------------------------------===// #include "PTXSubtarget.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; -PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS) { - std::string TARGET = "sm_20"; - // TODO: call ParseSubtargetFeatures(FS, TARGET); +PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS, + bool is64Bit) + : PTXShaderModel(PTX_SM_1_0), + PTXVersion(PTX_VERSION_2_0), + SupportsDouble(false), + Is64Bit(is64Bit) { + std::string TARGET = "generic"; + ParseSubtargetFeatures(FS, TARGET); +} + +std::string PTXSubtarget::getTargetString() const { + switch(PTXShaderModel) { + default: llvm_unreachable("Unknown shader model"); + case PTX_SM_1_0: return "sm_10"; + case PTX_SM_1_3: return "sm_13"; + case PTX_SM_2_0: return "sm_20"; + } +} + +std::string PTXSubtarget::getPTXVersionString() const { + switch(PTXVersion) { + default: llvm_unreachable("Unknown PTX version"); + case PTX_VERSION_2_0: return "2.0"; + case PTX_VERSION_2_1: return "2.1"; + case PTX_VERSION_2_2: return "2.2"; + } } #include "PTXGenSubtarget.inc" diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h index 7fd85f8..47d9842 100644 --- a/lib/Target/PTX/PTXSubtarget.h +++ b/lib/Target/PTX/PTXSubtarget.h @@ -19,10 +19,57 @@ namespace llvm { class PTXSubtarget : public TargetSubtarget { private: - bool is_sm20; + + /** + * Enumeration of Shader Models supported by the back-end. + */ + enum PTXShaderModelEnum { + PTX_SM_1_0, /*< Shader Model 1.0 */ + PTX_SM_1_3, /*< Shader Model 1.3 */ + PTX_SM_2_0 /*< Shader Model 2.0 */ + }; + + /** + * Enumeration of PTX versions supported by the back-end. + * + * Currently, PTX 2.0 is the minimum supported version. + */ + enum PTXVersionEnum { + PTX_VERSION_2_0, /*< PTX Version 2.0 */ + PTX_VERSION_2_1, /*< PTX Version 2.1 */ + PTX_VERSION_2_2 /*< PTX Version 2.2 */ + }; + + /// Shader Model supported on the target GPU. + PTXShaderModelEnum PTXShaderModel; + + /// PTX Language Version. + PTXVersionEnum PTXVersion; + + // The native .f64 type is supported on the hardware. + bool SupportsDouble; + + // Use .u64 instead of .u32 for addresses. 
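+  // (For example, a pointer kernel parameter is then declared .param .u64
+  //  instead of .param .u32 in the emitted PTX.)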
+ bool Is64Bit; public: - PTXSubtarget(const std::string &TT, const std::string &FS); + PTXSubtarget(const std::string &TT, const std::string &FS, bool is64Bit); + + std::string getTargetString() const; + + std::string getPTXVersionString() const; + + bool supportsDouble() const { return SupportsDouble; } + + bool is64Bit() const { return Is64Bit; } + + bool supportsSM13() const { return PTXShaderModel >= PTX_SM_1_3; } + + bool supportsSM20() const { return PTXShaderModel >= PTX_SM_2_0; } + + bool supportsPTX21() const { return PTXVersion >= PTX_VERSION_2_1; } + + bool supportsPTX22() const { return PTXVersion >= PTX_VERSION_2_2; } std::string ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp index b263813..1b737c9 100644 --- a/lib/Target/PTX/PTXTargetMachine.cpp +++ b/lib/Target/PTX/PTXTargetMachine.cpp @@ -16,12 +16,14 @@ #include "PTXTargetMachine.h" #include "llvm/PassManager.h" #include "llvm/Target/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; namespace llvm { MCStreamer *createPTXAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, bool isVerboseAsm, bool useLoc, + bool useCFI, MCInstPrinter *InstPrint, MCCodeEmitter *CE, TargetAsmBackend *TAB, @@ -29,21 +31,47 @@ namespace llvm { } extern "C" void LLVMInitializePTXTarget() { - RegisterTargetMachine<PTXTargetMachine> X(ThePTXTarget); - RegisterAsmInfo<PTXMCAsmInfo> Y(ThePTXTarget); - TargetRegistry::RegisterAsmStreamer(ThePTXTarget, createPTXAsmStreamer); + + RegisterTargetMachine<PTX32TargetMachine> X(ThePTX32Target); + RegisterTargetMachine<PTX64TargetMachine> Y(ThePTX64Target); + + RegisterAsmInfo<PTXMCAsmInfo> Z(ThePTX32Target); + RegisterAsmInfo<PTXMCAsmInfo> W(ThePTX64Target); + + TargetRegistry::RegisterAsmStreamer(ThePTX32Target, createPTXAsmStreamer); + TargetRegistry::RegisterAsmStreamer(ThePTX64Target, createPTXAsmStreamer); +} + +namespace { + const char* DataLayout32 = + "e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; + const char* DataLayout64 = + "e-p:64:64-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"; } // DataLayout and FrameLowering are filled with dummy data PTXTargetMachine::PTXTargetMachine(const Target &T, const std::string &TT, - const std::string &FS) + const std::string &FS, + bool is64Bit) : LLVMTargetMachine(T, TT), - DataLayout("e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64"), + DataLayout(is64Bit ? 
DataLayout64 : DataLayout32), + Subtarget(TT, FS, is64Bit), FrameLowering(Subtarget), InstrInfo(*this), - TLInfo(*this), - Subtarget(TT, FS) { + TLInfo(*this) { +} + +PTX32TargetMachine::PTX32TargetMachine(const Target &T, + const std::string& TT, + const std::string& FS) + : PTXTargetMachine(T, TT, FS, false) { +} + +PTX64TargetMachine::PTX64TargetMachine(const Target &T, + const std::string& TT, + const std::string& FS) + : PTXTargetMachine(T, TT, FS, true) { } bool PTXTargetMachine::addInstSelector(PassManagerBase &PM, diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h index 728e36f..149be8e 100644 --- a/lib/Target/PTX/PTXTargetMachine.h +++ b/lib/Target/PTX/PTXTargetMachine.h @@ -25,15 +25,15 @@ namespace llvm { class PTXTargetMachine : public LLVMTargetMachine { private: - const TargetData DataLayout; - PTXFrameLowering FrameLowering; - PTXInstrInfo InstrInfo; + const TargetData DataLayout; + PTXSubtarget Subtarget; // has to be initialized before FrameLowering + PTXFrameLowering FrameLowering; + PTXInstrInfo InstrInfo; PTXTargetLowering TLInfo; - PTXSubtarget Subtarget; public: PTXTargetMachine(const Target &T, const std::string &TT, - const std::string &FS); + const std::string &FS, bool is64Bit); virtual const TargetData *getTargetData() const { return &DataLayout; } @@ -55,6 +55,22 @@ class PTXTargetMachine : public LLVMTargetMachine { virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); }; // class PTXTargetMachine + + +class PTX32TargetMachine : public PTXTargetMachine { +public: + + PTX32TargetMachine(const Target &T, const std::string &TT, + const std::string& FS); +}; // class PTX32TargetMachine + +class PTX64TargetMachine : public PTXTargetMachine { +public: + + PTX64TargetMachine(const Target &T, const std::string &TT, + const std::string& FS); +}; // class PTX64TargetMachine + } // namespace llvm #endif // PTX_TARGET_MACHINE_H diff --git a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp index a577d77..9df6c75 100644 --- a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp +++ b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp @@ -13,9 +13,13 @@ using namespace llvm; -Target llvm::ThePTXTarget; +Target llvm::ThePTX32Target; +Target llvm::ThePTX64Target; extern "C" void LLVMInitializePTXTargetInfo() { // see llvm/ADT/Triple.h - RegisterTarget<Triple::ptx> X(ThePTXTarget, "ptx", "PTX"); + RegisterTarget<Triple::ptx32> X32(ThePTX32Target, "ptx32", + "PTX (32-bit) [Experimental]"); + RegisterTarget<Triple::ptx64> X64(ThePTX64Target, "ptx64", + "PTX (64-bit) [Experimental]"); } diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index ebc10da..9cf9db9 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -17,13 +17,16 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { - class MCOperand; + +class MCOperand; +class TargetMachine; class PPCInstPrinter : public MCInstPrinter { // 0 -> AIX, 1 -> Darwin.
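+  // (SyntaxVariant == 1 selects Darwin syntax; see isDarwinSyntax() below.)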
unsigned SyntaxVariant; public: - PPCInstPrinter(const MCAsmInfo &MAI, unsigned syntaxVariant) + PPCInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI, + unsigned syntaxVariant) : MCInstPrinter(MAI), SyntaxVariant(syntaxVariant) {} bool isDarwinSyntax() const { diff --git a/lib/Target/PowerPC/PPCAsmBackend.cpp b/lib/Target/PowerPC/PPCAsmBackend.cpp index c4d4ac9..f562a3f 100644 --- a/lib/Target/PowerPC/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/PPCAsmBackend.cpp @@ -110,10 +110,8 @@ namespace { TargetAsmBackend *llvm::createPPCAsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + if (Triple(TT).isOSDarwin()) return new DarwinPPCAsmBackend(T); - default: - return 0; - } + + return 0; } diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 8ed5d7f..09a9be9 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -680,9 +680,10 @@ static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm, } static MCInstPrinter *createPPCMCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { - return new PPCInstPrinter(MAI, SyntaxVariant); + return new PPCInstPrinter(TM, MAI, SyntaxVariant); } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 70d00e4..128522c 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -899,7 +899,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, short Imm; if (isIntS16Immediate(CN, Imm)) { Disp = DAG.getTargetConstant(Imm, CN->getValueType(0)); - Base = DAG.getRegister(PPC::R0, CN->getValueType(0)); + Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0, + CN->getValueType(0)); return true; } @@ -947,7 +948,8 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, } // Otherwise, do it the hard way, using R0 as the base register. - Base = DAG.getRegister(PPC::R0, N.getValueType()); + Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0, + N.getValueType()); Index = N; return true; } @@ -2153,7 +2155,7 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG, } /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be -/// adjusted to accomodate the arguments for the tailcall. +/// adjusted to accommodate the arguments for the tailcall. static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, unsigned ParamSize) { @@ -2394,7 +2396,7 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, // Emit a sequence of copyto/copyfrom virtual registers for arguments that // might overwrite each other in case of tail call optimization. SmallVector<SDValue, 8> MemOpChains2; - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. 
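+  // (Assigning a default-constructed SDValue clears the glue operand, so the
+  //  following copies start a fresh glue chain.)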
InFlag = SDValue(); StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, MemOpChains2, dl); @@ -2442,7 +2444,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) { unsigned OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - PPCSubTarget.getDarwinVers() < 9 && + (!PPCSubTarget.getTargetTriple().isMacOSX() || + PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && (G->getGlobal()->isDeclaration() || G->getGlobal()->isWeakForLinker())) { // PC-relative references to external symbols should go through $stub, @@ -2465,7 +2468,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, unsigned char OpFlags = 0; if (DAG.getTarget().getRelocationModel() != Reloc::Static && - PPCSubTarget.getDarwinVers() < 9) { + (!PPCSubTarget.getTargetTriple().isMacOSX() || + PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -4571,6 +4575,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, // registers without caring whether they're 32 or 64, but here we're // doing actual arithmetic on the addresses. bool is64bit = PPCSubTarget.isPPC64(); + unsigned ZeroReg = is64bit ? PPC::X0 : PPC::R0; const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); @@ -4634,8 +4639,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, // bne- loopMBB // fallthrough --> exitMBB // srw dest, tmpDest, shift - - if (ptrA!=PPC::R0) { + if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); @@ -4665,7 +4669,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, BB = loopMBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) - .addReg(PPC::R0).addReg(PtrReg); + .addReg(ZeroReg).addReg(PtrReg); if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) .addReg(Incr2Reg).addReg(TmpDestReg); @@ -4676,7 +4680,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) .addReg(Tmp3Reg).addReg(Tmp2Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)) - .addReg(Tmp4Reg).addReg(PPC::R0).addReg(PtrReg); + .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); BB->addSuccessor(loopMBB); @@ -4685,7 +4689,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, // exitMBB: // ... BB = exitMBB; - BuildMI(BB, dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg).addReg(ShiftReg); + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) + .addReg(ShiftReg); return BB; } @@ -4933,6 +4938,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); unsigned Ptr1Reg; unsigned TmpReg = RegInfo.createVirtualRegister(RC); + unsigned ZeroReg = is64bit ? PPC::X0 : PPC::R0; // thisMBB: // ... // fallthrough --> loopMBB @@ -4965,7 +4971,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // stwcx. tmpDest, ptr // exitBB: // srw dest, tmpDest, shift - if (ptrA!=PPC::R0) { + if (ptrA != ZeroReg) { Ptr1Reg = RegInfo.createVirtualRegister(RC); BuildMI(BB, dl, TII->get(is64bit ? 
PPC::ADD8 : PPC::ADD4), Ptr1Reg) .addReg(ptrA).addReg(ptrB); @@ -5002,7 +5008,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB = loop1MBB; BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) - .addReg(PPC::R0).addReg(PtrReg); + .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) .addReg(TmpDestReg).addReg(MaskReg); BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) @@ -5018,7 +5024,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) .addReg(Tmp2Reg).addReg(NewVal3Reg); BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) - .addReg(PPC::R0).addReg(PtrReg); + .addReg(ZeroReg).addReg(PtrReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); @@ -5027,13 +5033,14 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, BB = midMBB; BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) - .addReg(PPC::R0).addReg(PtrReg); + .addReg(ZeroReg).addReg(PtrReg); BB->addSuccessor(exitMBB); // exitMBB: // ... BB = exitMBB; - BuildMI(BB, dl, TII->get(PPC::SRW),dest).addReg(TmpReg).addReg(ShiftReg); + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) + .addReg(ShiftReg); } else { llvm_unreachable("Unexpected instr type to insert"); } diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 6636b69..9f0fae5 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -130,7 +130,7 @@ def : Pat<(PPCnop), // Atomic operations let usesCustomInserter = 1 in { - let Uses = [CR0] in { + let Defs = [CR0] in { def ATOMIC_LOAD_ADD_I64 : Pseudo< (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "", [(set G8RC:$dst, (atomic_load_add_64 xoaddr:$ptr, G8RC:$incr))]>; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 82aadeb..24071b7 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -550,7 +550,7 @@ def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), // Atomic operations let usesCustomInserter = 1 in { - let Uses = [CR0] in { + let Defs = [CR0] in { def ATOMIC_LOAD_ADD_I8 : Pseudo< (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "", [(set GPRC:$dst, (atomic_load_add_8 xoaddr:$ptr, GPRC:$incr))]>; diff --git a/lib/Target/PowerPC/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/PPCMCAsmInfo.cpp index d1178dd..9e508cc 100644 --- a/lib/Target/PowerPC/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/PPCMCAsmInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) { PCSymbol = "."; CommentString = ";"; - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; if (!is64Bit) Data64bitsDirective = 0; // We can't emit a 64-bit unit in PPC32 mode. diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 72a1dee..5f3aa23 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -70,7 +70,7 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS, , HasSTFIWX(false) , HasLazyResolverStubs(false) , IsJITCodeModel(false) - , DarwinVers(0) { + , TargetTriple(TT) { // Determine default and user specified characteristics std::string CPU = "generic"; @@ -92,19 +92,6 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS, // support it, ignore. 
if (use64BitRegs() && !has64BitSupport()) Use64BitRegs = false; - - // Set the boolean corresponding to the current target triple, or the default - // if one cannot be determined, to true. - if (TT.length() > 7) { - // Determine which version of darwin this is. - size_t DarwinPos = TT.find("-darwin"); - if (DarwinPos != std::string::npos) { - if (isdigit(TT[DarwinPos+7])) - DarwinVers = atoi(&TT[DarwinPos+7]); - else - DarwinVers = 8; // Minimum supported darwin is Tiger. - } - } // Set up darwin-specific properties. if (isDarwin()) diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 00ec747..8fd1a44 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -14,6 +14,7 @@ #ifndef POWERPCSUBTARGET_H #define POWERPCSUBTARGET_H +#include "llvm/ADT/Triple.h" #include "llvm/Target/TargetInstrItineraries.h" #include "llvm/Target/TargetSubtarget.h" @@ -65,9 +66,9 @@ protected: bool HasLazyResolverStubs; bool IsJITCodeModel; - /// DarwinVers - Nonzero if this is a darwin platform. Otherwise, the numeric - /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc. - unsigned char DarwinVers; // Is any darwin-ppc platform. + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -134,13 +135,10 @@ public: bool hasAltivec() const { return HasAltivec; } bool isGigaProcessor() const { return IsGigaProcessor; } - /// isDarwin - True if this is any darwin platform. - bool isDarwin() const { return DarwinVers != 0; } - /// isDarwin - True if this is darwin9 (leopard, 10.5) or above. - bool isDarwin9() const { return DarwinVers >= 9; } + const Triple &getTargetTriple() const { return TargetTriple; } - /// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard. - unsigned getDarwinVers() const { return DarwinVers; } + /// isDarwin - True if this is any darwin platform. + bool isDarwin() const { return TargetTriple.isMacOSX(); } bool isDarwinABI() const { return isDarwin(); } bool isSVR4ABI() const { return !isDarwin(); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 212b450..d27e54e 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -24,7 +24,7 @@ using namespace llvm; static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); bool isPPC64 = TheTriple.getArch() == Triple::ppc64; - if (TheTriple.getOS() == Triple::Darwin) + if (TheTriple.isOSDarwin()) return new PPCMCAsmInfoDarwin(isPPC64); return new PPCLinuxMCAsmInfo(isPPC64); @@ -37,12 +37,10 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, MCCodeEmitter *Emitter, bool RelaxAll, bool NoExecStack) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + if (Triple(TT).isOSDarwin()) return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll); - default: - return NULL; - } + + return NULL; } extern "C" void LLVMInitializePowerPCTarget() { diff --git a/lib/Target/README.txt b/lib/Target/README.txt index f85914b..ffe3fa4 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -392,34 +392,6 @@ PHI Slicing could be extended to do this. //===---------------------------------------------------------------------===// -LSR should know what GPR types a target has from TargetData. 
This code: - -volatile short X, Y; // globals - -void foo(int N) { - int i; - for (i = 0; i < N; i++) { X = i; Y = i*4; } -} - -produces two near identical IV's (after promotion) on PPC/ARM: - -LBB1_2: - ldr r3, LCPI1_0 - ldr r3, [r3] - strh r2, [r3] - ldr r3, LCPI1_1 - ldr r3, [r3] - strh r1, [r3] - add r1, r1, #4 - add r2, r2, #1 <- [0,+,1] - sub r0, r0, #1 <- [0,-,1] - cmp r0, #0 - bne LBB1_2 - -LSR should reuse the "+" IV for the exit test. - -//===---------------------------------------------------------------------===// - Tail call elim should be more aggressive, checking to see if the call is followed by an uncond branch to an exit block. @@ -1325,6 +1297,21 @@ codegen. //===---------------------------------------------------------------------===// +simplifylibcalls should turn these snprintf idioms into memcpy (GCC PR47917) + +char buf1[6], buf2[6], buf3[4], buf4[4]; +int i; + +int foo (void) { + int ret = snprintf (buf1, sizeof buf1, "abcde"); + ret += snprintf (buf2, sizeof buf2, "abcdef") * 16; + ret += snprintf (buf3, sizeof buf3, "%s", i++ < 6 ? "abc" : "def") * 256; + ret += snprintf (buf4, sizeof buf4, "%s", i++ > 10 ? "abcde" : "defgh")*4096; + return ret; +} + +//===---------------------------------------------------------------------===// + "gas" uses this idiom: else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string)) .. @@ -1780,43 +1767,6 @@ case it choses instead to keep the max operation obvious. //===---------------------------------------------------------------------===// -Take the following testcase on x86-64 (similar testcases exist for all targets -with addc/adde): - -define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b, -i64 %c) nounwind { -entry: - %0 = zext i64 %a to i128 ; <i128> [#uses=1] - %1 = zext i64 %b to i128 ; <i128> [#uses=1] - %2 = add i128 %1, %0 ; <i128> [#uses=2] - %3 = zext i64 %c to i128 ; <i128> [#uses=1] - %4 = shl i128 %3, 64 ; <i128> [#uses=1] - %5 = add i128 %4, %2 ; <i128> [#uses=1] - %6 = lshr i128 %5, 64 ; <i128> [#uses=1] - %7 = trunc i128 %6 to i64 ; <i64> [#uses=1] - store i64 %7, i64* %s, align 8 - %8 = trunc i128 %2 to i64 ; <i64> [#uses=1] - store i64 %8, i64* %t, align 8 - ret void -} - -Generated code: - addq %rcx, %rdx - sbbq %rax, %rax - subq %rax, %r8 - movq %r8, (%rdi) - movq %rdx, (%rsi) - ret - -Expected code: - addq %rcx, %rdx - adcq $0, %r8 - movq %r8, (%rdi) - movq %rdx, (%rsi) - ret - -//===---------------------------------------------------------------------===// - Switch lowering generates less than ideal code for the following switch: define void @a(i32 %x) nounwind { entry: @@ -2124,11 +2074,12 @@ for.end: ; preds = %entry } This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold -the two memset's together. The issue with %n seems to stem from poor handling -of the original loop. +the two memsets together. -To simplify this, we need SCEV to know that "n != 0" because of the dominating -conditional. That would turn the second memset into a simple memset of 'n'. +The issue with the addition only occurs in 64-bit mode, and appears to be at +least partially caused by Scalar Evolution not keeping its cache updated: it +returns the "wrong" result immediately after indvars runs, but figures out the +expected result if it is run from scratch on IR resulting from running indvars.
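+
+A reduced illustration (hypothetical; the original testcase for this entry is
+not shown here): for code along the lines of
+
+void f(char *p, int n) {
+  int i;
+  for (i = 0; i < n; i++) p[i] = 0;
+  for (i = 0; i < n; i++) p[i + n] = 0;
+}
+
+loop idiom recognition produces two adjacent memsets whose lengths are
+computed as ((zext i32 (%n - 1) to i64) + 1) on 64-bit targets; folding them
+into a single memset of 2*n would be the ideal result.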
//===---------------------------------------------------------------------===// @@ -2287,4 +2238,71 @@ missed cases: //===---------------------------------------------------------------------===// + +define i1 @test1(i32 %x) nounwind { + %and = and i32 %x, 3 + %cmp = icmp ult i32 %and, 2 + ret i1 %cmp +} + +Can be folded to (x & 2) == 0. + +define i1 @test2(i32 %x) nounwind { + %and = and i32 %x, 3 + %cmp = icmp ugt i32 %and, 1 + ret i1 %cmp +} + +Can be folded to (x & 2) != 0. + +SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the +icmp transform. + +//===---------------------------------------------------------------------===// + +This code: + +typedef struct { +int f1:1; +int f2:1; +int f3:1; +int f4:29; +} t1; + +typedef struct { +int f1:1; +int f2:1; +int f3:30; +} t2; + +t1 s1; +t2 s2; + +void func1(void) +{ +s1.f1 = s2.f1; +s1.f2 = s2.f2; +} + +Compiles into this IR (on x86-64 at least): + +%struct.t1 = type { i8, [3 x i8] } +@s2 = global %struct.t1 zeroinitializer, align 4 +@s1 = global %struct.t1 zeroinitializer, align 4 +define void @func1() nounwind ssp noredzone { +entry: + %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4 + %bf.val.sext5 = and i32 %0, 1 + %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4 + %2 = and i32 %1, -4 + %3 = or i32 %2, %bf.val.sext5 + %bf.val.sext26 = and i32 %0, 2 + %4 = or i32 %3, %bf.val.sext26 + store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4 + ret void +} + +The two or/and pairs should be merged into one each. + +//===---------------------------------------------------------------------===// + diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 70574c3..edb62fa 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -544,7 +544,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. - // The InFlag in necessary since all emited instructions must be + // The InFlag is necessary since all emitted instructions must be // stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { diff --git a/lib/Target/SubtargetFeature.cpp b/lib/Target/SubtargetFeature.cpp index 3cf95b5..e0a9de8 100644 --- a/lib/Target/SubtargetFeature.cpp +++ b/lib/Target/SubtargetFeature.cpp @@ -211,7 +211,7 @@ const std::string & SubtargetFeatures::getCPU() const { /// feature, set it. /// static -void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, +void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, const SubtargetFeatureKV *FeatureTable, size_t FeatureTableSize) { for (size_t i = 0; i < FeatureTableSize; ++i) { @@ -230,7 +230,7 @@ void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, /// feature, clear it. /// static -void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, +void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, const SubtargetFeatureKV *FeatureTable, size_t FeatureTableSize) { for (size_t i = 0; i < FeatureTableSize; ++i) { @@ -247,7 +247,7 @@ void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, /// getBits - Get feature bits.
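+/// The bits are carried in a uint64_t, so a target may define up to 64
+/// independent subtarget features.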
/// -uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable, +uint64_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable, size_t CPUTableSize, const SubtargetFeatureKV *FeatureTable, size_t FeatureTableSize) { @@ -263,7 +263,7 @@ uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable, "CPU features table is not sorted"); } #endif - uint32_t Bits = 0; // Resulting bits + uint64_t Bits = 0; // Resulting bits // Check if help is needed if (Features[0] == "help") diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 90939c3..d331614 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -451,7 +451,7 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain and // flag operands which copy the outgoing args into registers. The InFlag in - // necessary since all emited instructions must be stuck together. + // necessary since all emitted instructions must be stuck together. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp index c628df0..1990bc7 100644 --- a/lib/Target/TargetData.cpp +++ b/lib/Target/TargetData.cpp @@ -617,10 +617,14 @@ uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices, unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const { const Type *ElemType = GV->getType()->getElementType(); unsigned Alignment = getPrefTypeAlignment(ElemType); - if (GV->getAlignment() > Alignment) - Alignment = GV->getAlignment(); + unsigned GVAlignment = GV->getAlignment(); + if (GVAlignment >= Alignment) { + Alignment = GVAlignment; + } else if (GVAlignment != 0) { + Alignment = std::max(GVAlignment, getABITypeAlignment(ElemType)); + } - if (GV->hasInitializer()) { + if (GV->hasInitializer() && GVAlignment == 0) { if (Alignment < 16) { // If the global is not external, see if it is large. If so, give it a // larger alignment. diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp index 97f3bf6..d4b7697 100644 --- a/lib/Target/TargetInstrInfo.cpp +++ b/lib/Target/TargetInstrInfo.cpp @@ -149,10 +149,10 @@ bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { /// Measure the specified inline asm to determine an approximation of its /// length. -/// Comments (which run till the next SeparatorChar or newline) do not +/// Comments (which run till the next SeparatorString or newline) do not /// count as an instruction. /// Any other non-whitespace text is considered an instruction, with -/// multiple instructions separated by SeparatorChar or newlines. +/// multiple instructions separated by SeparatorString or newlines. /// Variable-length instructions are not handled here; this function /// may be overloaded in the target code to do that. 
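+/// For example (illustrative): with SeparatorString ";" and MaxInstLength 4,
+/// the string "mov %eax, %ebx ; add %ecx, %edx\n" is measured as two
+/// instructions, for an estimate of 8 bytes.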
unsigned TargetInstrInfo::getInlineAsmLength(const char *Str, @@ -163,7 +163,8 @@ unsigned TargetInstrInfo::getInlineAsmLength(const char *Str, bool atInsnStart = true; unsigned Length = 0; for (; *Str; ++Str) { - if (*Str == '\n' || *Str == MAI.getSeparatorChar()) + if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(), + strlen(MAI.getSeparatorString())) == 0) atInsnStart = true; if (atInsnStart && !std::isspace(*Str)) { Length += MAI.getMaxInstLength(); diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp index c8bed18..e336b09 100644 --- a/lib/Target/TargetLibraryInfo.cpp +++ b/lib/Target/TargetLibraryInfo.cpp @@ -28,9 +28,22 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T) { // memset_pattern16 is only available on iOS 3.0 and Mac OS/X 10.5 and later. - if (T.getOS() != Triple::Darwin || T.getDarwinMajorNumber() < 9) + if (T.isMacOSX()) { + if (T.isMacOSXVersionLT(10, 5)) + TLI.setUnavailable(LibFunc::memset_pattern16); + } else if (T.getOS() == Triple::IOS) { + if (T.isOSVersionLT(3, 0)) + TLI.setUnavailable(LibFunc::memset_pattern16); + } else { TLI.setUnavailable(LibFunc::memset_pattern16); - + } + + // iprintf and friends are only available on XCore. + if (T.getArch() != Triple::xcore) { + TLI.setUnavailable(LibFunc::iprintf); + TLI.setUnavailable(LibFunc::siprintf); + TLI.setUnavailable(LibFunc::fiprintf); + } } diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 5d34c7d..717ad41 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -120,6 +120,18 @@ static bool IsNullTerminatedString(const Constant *C) { return false; } +MCSymbol *TargetLoweringObjectFile:: +getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const { + return Mang->getSymbol(GV); +} + +void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer, + const TargetMachine &TM, + const MCSymbol *Sym) const { +} + + /// getKindForGlobal - This is a top-level target-independent classifier for /// a global variable. 
Given an global variable and information from TM, it /// classifies the global in a variety of ways that make various target @@ -305,16 +317,15 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding, MCStreamer &Streamer) const { const MCSymbol *Sym = Mang->getSymbol(GV); - return getExprForDwarfReference(Sym, Mang, MMI, Encoding, Streamer); + return getExprForDwarfReference(Sym, Encoding, Streamer); } const MCExpr *TargetLoweringObjectFile:: -getExprForDwarfReference(const MCSymbol *Sym, Mangler *Mang, - MachineModuleInfo *MMI, unsigned Encoding, +getExprForDwarfReference(const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { const MCExpr *Res = MCSymbolRefExpr::Create(Sym, getContext()); - switch (Encoding & 0xF0) { + switch (Encoding & 0x70) { default: report_fatal_error("We do not support this DWARF encoding yet!"); case dwarf::DW_EH_PE_absptr: @@ -339,7 +350,7 @@ unsigned TargetLoweringObjectFile::getLSDAEncoding() const { return dwarf::DW_EH_PE_absptr; } -unsigned TargetLoweringObjectFile::getFDEEncoding() const { +unsigned TargetLoweringObjectFile::getFDEEncoding(bool CFI) const { return dwarf::DW_EH_PE_absptr; } diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index d579d95..76ccc09 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -48,6 +48,7 @@ namespace llvm { bool RealignStack; bool DisableJumpTables; bool StrongPHIElim; + bool HasDivModLibcall; bool AsmVerbosityDefault(false); } @@ -205,6 +206,10 @@ EnableStrongPHIElim(cl::Hidden, "strong-phi-elim", cl::desc("Use strong PHI elimination."), cl::location(StrongPHIElim), cl::init(false)); +static cl::opt<std::string> +TrapFuncName("trap-func", cl::Hidden, + cl::desc("Emit a call to trap function rather than a trap instruction"), + cl::init("")); static cl::opt<bool> DataSections("fdata-sections", cl::desc("Emit data into separate sections"), @@ -221,7 +226,9 @@ TargetMachine::TargetMachine(const Target &T) : TheTarget(T), AsmInfo(0), MCRelaxAll(false), MCNoExecStack(false), - MCUseLoc(true) { + MCSaveTempLabels(false), + MCUseLoc(true), + MCUseCFI(true) { // Typically it will be subtargets that will adjust FloatABIType from Default // to Soft or Hard. if (UseSoftFloat) @@ -303,4 +310,11 @@ namespace llvm { bool HonorSignDependentRoundingFPMath() { return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption; } + + /// getTrapFunctionName - If this returns a non-empty string, this means isel + /// should lower Intrinsic::trap to a call to the specified function name + /// instead of an ISD::TRAP node. + StringRef getTrapFunctionName() { + return TrapFuncName; + } } diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 8fe549b..c352bfc 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -53,6 +53,14 @@ private: SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out); + /// isSrcOp - Returns true if operand is either (%rsi) or %ds:(%rsi) + /// in 64bit mode or (%esi) or %ds:(%esi) in 32bit mode. + bool isSrcOp(X86Operand &Op); + + /// isDstOp - Returns true if operand is either %es:(%rdi) in 64bit mode + /// or %es:(%edi) in 32bit mode. + bool isDstOp(X86Operand &Op); + /// @name Auto-generated Matcher Functions /// { @@ -356,6 +364,24 @@ struct X86Operand : public MCParsedAsmOperand { } // end anonymous namespace. +bool X86ATTAsmParser::isSrcOp(X86Operand &Op) { + unsigned basereg = Is64Bit ?
X86::RSI : X86::ESI; + + return (Op.isMem() && + (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::DS) && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0); +} + +bool X86ATTAsmParser::isDstOp(X86Operand &Op) { + unsigned basereg = Is64Bit ? X86::RDI : X86::EDI; + + return Op.isMem() && Op.Mem.SegReg == X86::ES && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0; +} bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { @@ -788,7 +814,106 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, delete &Op; } } - + // Transform "ins[bwl] %dx, %es:(%edi)" into "ins[bwl]" + if (Name.startswith("ins") && Operands.size() == 3 && + (Name == "insb" || Name == "insw" || Name == "insl")) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (Op.isReg() && Op.getReg() == X86::DX && isDstOp(Op2)) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + + // Transform "outs[bwl] %ds:(%esi), %dx" into "outs[bwl]" + if (Name.startswith("outs") && Operands.size() == 3 && + (Name == "outsb" || Name == "outsw" || Name == "outsl")) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (isSrcOp(Op) && Op2.isReg() && Op2.getReg() == X86::DX) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + + // Transform "movs[bwl] %ds:(%esi), %es:(%edi)" into "movs[bwl]" + if (Name.startswith("movs") && Operands.size() == 3 && + (Name == "movsb" || Name == "movsw" || Name == "movsl" || + (Is64Bit && Name == "movsq"))) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (isSrcOp(Op) && isDstOp(Op2)) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + // Transform "lods[bwl] %ds:(%esi),{%al,%ax,%eax,%rax}" into "lods[bwl]" + if (Name.startswith("lods") && Operands.size() == 3 && + (Name == "lods" || Name == "lodsb" || Name == "lodsw" || + Name == "lodsl" || (Is64Bit && Name == "lodsq"))) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]); + if (isSrcOp(*Op1) && Op2->isReg()) { + const char *ins; + unsigned reg = Op2->getReg(); + bool isLods = Name == "lods"; + if (reg == X86::AL && (isLods || Name == "lodsb")) + ins = "lodsb"; + else if (reg == X86::AX && (isLods || Name == "lodsw")) + ins = "lodsw"; + else if (reg == X86::EAX && (isLods || Name == "lodsl")) + ins = "lodsl"; + else if (reg == X86::RAX && (isLods || Name == "lodsq")) + ins = "lodsq"; + else + ins = NULL; + if (ins != NULL) { + Operands.pop_back(); + Operands.pop_back(); + delete Op1; + delete Op2; + if (Name != ins) + static_cast<X86Operand*>(Operands[0])->setTokenValue(ins); + } + } + } + // Transform "stos[bwl] {%al,%ax,%eax,%rax},%es:(%edi)" into "stos[bwl]" + if (Name.startswith("stos") && Operands.size() == 3 && + (Name == "stos" || Name == "stosb" || Name == "stosw" || + Name == "stosl" || (Is64Bit && Name == "stosq"))) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]); + if (isDstOp(*Op2) && Op1->isReg()) { + const char *ins; + unsigned reg = Op1->getReg(); + bool isStos = Name == "stos"; + if (reg == X86::AL && (isStos ||
Name == "stosb")) + ins = "stosb"; + else if (reg == X86::AX && (isStos || Name == "stosw")) + ins = "stosw"; + else if (reg == X86::EAX && (isStos || Name == "stosl")) + ins = "stosl"; + else if (reg == X86::RAX && (isStos || Name == "stosq")) + ins = "stosq"; + else + ins = NULL; + if (ins != NULL) { + Operands.pop_back(); + Operands.pop_back(); + delete Op1; + delete Op2; + if (Name != ins) + static_cast<X86Operand*>(Operands[0])->setTokenValue(ins); + } + } + } + // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to // "shift <op>". if ((Name.startswith("shr") || Name.startswith("sar") || @@ -803,6 +928,18 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, Operands.erase(Operands.begin() + 1); } } + + // Transforms "int $3" into "int3" as a size optimization. We can't write an + // instalias with an immediate operand yet. + if (Name == "int" && Operands.size() == 2) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && + cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) { + delete Operands[1]; + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand*>(Operands[0])->setTokenValue("int3"); + } + } return false; } diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index f777756..d8a105e 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -409,6 +409,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM32: case TYPE_XMM64: case TYPE_XMM128: + case TYPE_XMM256: case TYPE_DEBUGREG: case TYPE_CONTROLREG: return translateRMRegister(mcInst, insn); @@ -418,6 +419,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_M32: case TYPE_M64: case TYPE_M128: + case TYPE_M256: case TYPE_M512: case TYPE_Mv: case TYPE_M32FP: @@ -500,6 +502,9 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, case ENCODING_Rv: translateRegister(mcInst, insn.opcodeRegister); return false; + case ENCODING_VVVV: + translateRegister(mcInst, insn.vvvv); + return false; case ENCODING_DUP: return translateOperand(mcInst, insn.spec->operands[operand.type - TYPE_DUP0], diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index b6546fc..de1610b 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -75,6 +75,12 @@ static int modRMRequired(OpcodeType type, case THREEBYTE_3A: decision = &THREEBYTE3A_SYM; break; + case THREEBYTE_A6: + decision = &THREEBYTEA6_SYM; + break; + case THREEBYTE_A7: + decision = &THREEBYTEA7_SYM; + break; } return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. 
@@ -115,6 +121,12 @@ static InstrUID decode(OpcodeType type, case THREEBYTE_3A: dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; break; + case THREEBYTE_A6: + dec = &THREEBYTEA6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_A7: + dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; } switch (dec->modrm_type) { @@ -368,29 +380,109 @@ static int readPrefixes(struct InternalInstruction* insn) { if (isPrefix) dbgprintf(insn, "Found prefix 0x%hhx", byte); } + + insn->vexSize = 0; - if (insn->mode == MODE_64BIT) { - if ((byte & 0xf0) == 0x40) { - uint8_t opcodeByte; + if (byte == 0xc4) { + uint8_t byte1; - if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { - dbgprintf(insn, "Redundant REX prefix"); - return -1; + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || byte1 & 0x8) { + insn->vexSize = 3; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vexSize == 3) { + insn->vexPrefix[0] = byte; + consumeByte(insn, &insn->vexPrefix[1]); + consumeByte(insn, &insn->vexPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + insn->rexPrefix = 0x40 + | (wFromVEX3of3(insn->vexPrefix[2]) << 3) + | (rFromVEX2of3(insn->vexPrefix[1]) << 2) + | (xFromVEX2of3(insn->vexPrefix[1]) << 1) + | (bFromVEX2of3(insn->vexPrefix[1]) << 0); + + switch (ppFromVEX3of3(insn->vexPrefix[2])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = TRUE; + break; } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]); + } + } + else if (byte == 0xc5) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } - insn->rexPrefix = byte; - insn->necessaryPrefixLocation = insn->readerCursor - 2; - - dbgprintf(insn, "Found REX prefix 0x%hhx", byte); - } else { + if (insn->mode == MODE_64BIT || byte1 & 0x8) { + insn->vexSize = 2; + } + else { + unconsumeByte(insn); + } + + if (insn->vexSize == 2) { + insn->vexPrefix[0] = byte; + consumeByte(insn, &insn->vexPrefix[1]); + + insn->rexPrefix = 0x40 + | (rFromVEX2of2(insn->vexPrefix[1]) << 2); + + switch (ppFromVEX2of2(insn->vexPrefix[1])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = TRUE; + break; + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]); + } + } + else { + if (insn->mode == MODE_64BIT) { + if ((byte & 0xf0) == 0x40) { + uint8_t opcodeByte; + + if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { + dbgprintf(insn, "Redundant REX prefix"); + return -1; + } + + insn->rexPrefix = byte; + insn->necessaryPrefixLocation = insn->readerCursor - 2; + + dbgprintf(insn, "Found REX prefix 0x%hhx", byte); + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } else { unconsumeByte(insn); insn->necessaryPrefixLocation = insn->readerCursor - 1; } - } else { - unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; } - + if (insn->mode == MODE_16BIT) { insn->registerSize = (hasOpSize ? 4 : 2); insn->addressSize = (hasAdSize ? 
4 : 2); @@ -438,6 +530,39 @@ static int readOpcode(struct InternalInstruction* insn) { dbgprintf(insn, "readOpcode()"); insn->opcodeType = ONEBYTE; + + if (insn->vexSize == 3) + { + switch (mmmmmFromVEX2of3(insn->vexPrefix[1])) + { + default: + dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1])); + return -1; + case 0: + break; + case VEX_LOB_0F: + insn->twoByteEscape = 0x0f; + insn->opcodeType = TWOBYTE; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F38: + insn->twoByteEscape = 0x0f; + insn->threeByteEscape = 0x38; + insn->opcodeType = THREEBYTE_38; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F3A: + insn->twoByteEscape = 0x0f; + insn->threeByteEscape = 0x3a; + insn->opcodeType = THREEBYTE_3A; + return consumeByte(insn, &insn->opcode); + } + } + else if (insn->vexSize == 2) + { + insn->twoByteEscape = 0x0f; + insn->opcodeType = TWOBYTE; + return consumeByte(insn, &insn->opcode); + } + if (consumeByte(insn, ¤t)) return -1; @@ -467,6 +592,24 @@ static int readOpcode(struct InternalInstruction* insn) { return -1; insn->opcodeType = THREEBYTE_3A; + } else if (current == 0xa6) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + insn->threeByteEscape = current; + + if (consumeByte(insn, ¤t)) + return -1; + + insn->opcodeType = THREEBYTE_A6; + } else if (current == 0xa7) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + insn->threeByteEscape = current; + + if (consumeByte(insn, ¤t)) + return -1; + + insn->opcodeType = THREEBYTE_A7; } else { dbgprintf(insn, "Didn't find a three-byte escape prefix"); @@ -600,20 +743,64 @@ static int getID(struct InternalInstruction* insn) { dbgprintf(insn, "getID()"); attrMask = ATTR_NONE; - + if (insn->mode == MODE_64BIT) attrMask |= ATTR_64BIT; - - if (insn->rexPrefix & 0x08) - attrMask |= ATTR_REXW; - - if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) - attrMask |= ATTR_OPSIZE; - else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) - attrMask |= ATTR_XS; - else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) - attrMask |= ATTR_XD; - + + if (insn->vexSize) { + attrMask |= ATTR_VEX; + + if (insn->vexSize == 3) { + switch (ppFromVEX3of3(insn->vexPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (wFromVEX3of3(insn->vexPrefix[2])) + attrMask |= ATTR_REXW; + if (lFromVEX3of3(insn->vexPrefix[2])) + attrMask |= ATTR_VEXL; + } + else if (insn->vexSize == 2) { + switch (ppFromVEX2of2(insn->vexPrefix[1])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX2of2(insn->vexPrefix[1])) + attrMask |= ATTR_VEXL; + } + else { + return -1; + } + } + else { + if (insn->rexPrefix & 0x08) + attrMask |= ATTR_REXW; + + if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) + attrMask |= ATTR_OPSIZE; + else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XS; + else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XD; + + } + if (getIDWithAttrMask(&instructionID, insn, attrMask)) return -1; @@ -749,7 +936,7 @@ static int readSIB(struct InternalInstruction* insn) { insn->sibIndex = SIB_INDEX_NONE; break; default: - insn->sibIndex = 
(EABase)(sibIndexBase + index); + insn->sibIndex = (SIBIndex)(sibIndexBase + index); if (insn->sibIndex == SIB_INDEX_sib || insn->sibIndex == SIB_INDEX_sib64) insn->sibIndex = SIB_INDEX_NONE; @@ -796,7 +983,7 @@ static int readSIB(struct InternalInstruction* insn) { } break; default: - insn->sibBase = (EABase)(sibBaseBase + base); + insn->sibBase = (SIBBase)(sibBaseBase + base); break; } @@ -1012,6 +1199,8 @@ static int readModRM(struct InternalInstruction* insn) { return prefix##_EAX + index; \ case TYPE_R64: \ return prefix##_RAX + index; \ + case TYPE_XMM256: \ + return prefix##_YMM0 + index; \ case TYPE_XMM128: \ case TYPE_XMM64: \ case TYPE_XMM32: \ @@ -1073,6 +1262,14 @@ static int fixupReg(struct InternalInstruction *insn, default: debug("Expected a REG or R/M encoding in fixupReg"); return -1; + case ENCODING_VVVV: + insn->vvvv = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->vvvv, + &valid); + if (!valid) + return -1; + break; case ENCODING_REG: insn->reg = (Reg)fixupRegValue(insn, (OperandType)op->type, @@ -1237,6 +1434,27 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) { } /* + * readVVVV - Reads the VEX.vvvv register operand field out of the + * instruction's VEX prefix. + * + * @param insn - The instruction whose vvvv field is to be read. + * @return - 0 if the field was successfully read; nonzero + * otherwise. + */ +static int readVVVV(struct InternalInstruction* insn) { + dbgprintf(insn, "readVVVV()"); + + if (insn->vexSize == 3) + insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]); + else if (insn->vexSize == 2) + insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]); + else + return -1; + + return 0; +} + +/* * readOperands - Consults the specifier for an instruction and consumes all * operands for that instruction, interpreting them as it goes. 
* @@ -1317,6 +1535,13 @@ static int readOperands(struct InternalInstruction* insn) { case ENCODING_I: if (readOpcodeModifier(insn)) return -1; + break; + case ENCODING_VVVV: + if (readVVVV(insn)) + return -1; + if (fixupReg(insn, &insn->spec->operands[index])) + return -1; + break; case ENCODING_DUP: break; default: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index d0dc8b5..a9c90f8 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -34,16 +34,30 @@ extern "C" { /* * Accessor functions for various fields of an Intel instruction */ -#define modFromModRM(modRM) ((modRM & 0xc0) >> 6) -#define regFromModRM(modRM) ((modRM & 0x38) >> 3) -#define rmFromModRM(modRM) (modRM & 0x7) -#define scaleFromSIB(sib) ((sib & 0xc0) >> 6) -#define indexFromSIB(sib) ((sib & 0x38) >> 3) -#define baseFromSIB(sib) (sib & 0x7) -#define wFromREX(rex) ((rex & 0x8) >> 3) -#define rFromREX(rex) ((rex & 0x4) >> 2) -#define xFromREX(rex) ((rex & 0x2) >> 1) -#define bFromREX(rex) (rex & 0x1) +#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6) +#define regFromModRM(modRM) (((modRM) & 0x38) >> 3) +#define rmFromModRM(modRM) ((modRM) & 0x7) +#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6) +#define indexFromSIB(sib) (((sib) & 0x38) >> 3) +#define baseFromSIB(sib) ((sib) & 0x7) +#define wFromREX(rex) (((rex) & 0x8) >> 3) +#define rFromREX(rex) (((rex) & 0x4) >> 2) +#define xFromREX(rex) (((rex) & 0x2) >> 1) +#define bFromREX(rex) ((rex) & 0x1) + +#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7) +#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6) +#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5) +#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f) +#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7) +#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX3of3(vex) ((vex) & 0x3) + +#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7) +#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX2of2(vex) ((vex) & 0x3) /* * These enums represent Intel registers for use by the decoder. 
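Because the R, X, B and vvvv bits above are stored inverted, they are easy to get backwards; the following self-contained sketch (the prefix bytes c4 e1 71 are hypothetical, chosen only for illustration) runs the accessors on a 3-byte VEX prefix and rebuilds the simulated REX value the same way readPrefixes does:

#include <stdint.h>
#include <stdio.h>

/* Field accessors, same definitions as in X86DisassemblerDecoder.h above. */
#define rFromVEX2of3(vex)     (((~(vex)) & 0x80) >> 7)
#define xFromVEX2of3(vex)     (((~(vex)) & 0x40) >> 6)
#define bFromVEX2of3(vex)     (((~(vex)) & 0x20) >> 5)
#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f)
#define wFromVEX3of3(vex)     (((vex) & 0x80) >> 7)
#define vvvvFromVEX3of3(vex)  (((~(vex)) & 0x78) >> 3)
#define lFromVEX3of3(vex)     (((vex) & 0x4) >> 2)
#define ppFromVEX3of3(vex)    ((vex) & 0x3)

int main(void) {
  uint8_t vex[3] = {0xc4, 0xe1, 0x71};  /* hypothetical 3-byte VEX prefix */

  /* R, X and B are inverted in the encoding, so 0xe1 (top three bits set)
     decodes to R = X = B = 0 and the simulated REX collapses to 0x40. */
  uint8_t rex = 0x40
              | (wFromVEX3of3(vex[2]) << 3)
              | (rFromVEX2of3(vex[1]) << 2)
              | (xFromVEX2of3(vex[1]) << 1)
              | (bFromVEX2of3(vex[1]) << 0);

  printf("simulated REX: 0x%02x\n", rex);                       /* 0x40 */
  printf("m-mmmm: %d (0F escape)\n", mmmmmFromVEX2of3(vex[1])); /* 1 */
  printf("vvvv: %d (extra source = register 1)\n", vvvvFromVEX3of3(vex[2]));
  printf("L: %d  pp: %d (66 prefix)\n",
         lFromVEX3of3(vex[2]), ppFromVEX3of3(vex[2]));          /* 0, 1 */
  return 0;
}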
@@ -206,7 +220,25 @@ extern "C" { ENTRY(XMM13) \ ENTRY(XMM14) \ ENTRY(XMM15) - + +#define REGS_YMM \ + ENTRY(YMM0) \ + ENTRY(YMM1) \ + ENTRY(YMM2) \ + ENTRY(YMM3) \ + ENTRY(YMM4) \ + ENTRY(YMM5) \ + ENTRY(YMM6) \ + ENTRY(YMM7) \ + ENTRY(YMM8) \ + ENTRY(YMM9) \ + ENTRY(YMM10) \ + ENTRY(YMM11) \ + ENTRY(YMM12) \ + ENTRY(YMM13) \ + ENTRY(YMM14) \ + ENTRY(YMM15) + #define REGS_SEGMENT \ ENTRY(ES) \ ENTRY(CS) \ @@ -252,6 +284,7 @@ extern "C" { REGS_64BIT \ REGS_MMX \ REGS_XMM \ + REGS_YMM \ REGS_SEGMENT \ REGS_DEBUG \ REGS_CONTROL \ @@ -332,6 +365,27 @@ typedef enum { SEG_OVERRIDE_GS, SEG_OVERRIDE_max } SegmentOverride; + +/* + * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field + */ + +typedef enum { + VEX_LOB_0F = 0x1, + VEX_LOB_0F38 = 0x2, + VEX_LOB_0F3A = 0x3 +} VEXLeadingOpcodeByte; + +/* + * VEXPrefixCode - Possible values for the VEX.pp field + */ + +typedef enum { + VEX_PREFIX_NONE = 0x0, + VEX_PREFIX_66 = 0x1, + VEX_PREFIX_F3 = 0x2, + VEX_PREFIX_F2 = 0x3 +} VEXPrefixCode; typedef uint8_t BOOL; @@ -389,10 +443,12 @@ struct InternalInstruction { uint8_t prefixPresent[0x100]; /* contains the location (for use with the reader) of the prefix byte */ uint64_t prefixLocations[0x100]; + /* The value of the VEX prefix, if present */ + uint8_t vexPrefix[3]; + /* The length of the VEX prefix (0 if not present) */ + uint8_t vexSize; /* The value of the REX prefix, if present */ uint8_t rexPrefix; - /* The location of the REX prefix */ - uint64_t rexLocation; /* The location where a mandatory prefix would have to be (i.e., right before the opcode, or right before the REX prefix if one is present) */ uint64_t necessaryPrefixLocation; @@ -428,6 +484,10 @@ struct InternalInstruction { /* state for additional bytes, consumed during operand decode. 
Pattern: consumed___ indicates that the byte was already consumed and does not need to be consumed again */ + + /* The VEX.vvvv field, which contains a third register operand for some AVX + instructions */ + Reg vvvv; /* The ModR/M byte, which contains most register operands and some portion of all memory operands */ diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 1425b86..70315ed 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -30,6 +30,8 @@ #define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes #define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes #define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes +#define THREEBYTEA6_SYM x86DisassemblerThreeByteA6Opcodes +#define THREEBYTEA7_SYM x86DisassemblerThreeByteA7Opcodes #define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers" #define CONTEXTS_STR "x86DisassemblerContexts" @@ -37,6 +39,8 @@ #define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes" #define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes" #define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes" +#define THREEBYTEA6_STR "x86DisassemblerThreeByteA6Opcodes" +#define THREEBYTEA7_STR "x86DisassemblerThreeByteA7Opcodes" /* * Attributes of an instruction that must be known before the opcode can be @@ -49,7 +53,9 @@ ENUM_ENTRY(ATTR_XS, 0x02) \ ENUM_ENTRY(ATTR_XD, 0x04) \ ENUM_ENTRY(ATTR_REXW, 0x08) \ - ENUM_ENTRY(ATTR_OPSIZE, 0x10) + ENUM_ENTRY(ATTR_OPSIZE, 0x10) \ + ENUM_ENTRY(ATTR_VEX, 0x20) \ + ENUM_ENTRY(ATTR_VEXL, 0x40) #define ENUM_ENTRY(n, v) n = v, enum attributeBits { @@ -87,7 +93,20 @@ enum attributeBits { "IC_64BIT_REXW_XS") \ ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 7, "The Dynamic Duo! 
Prefer over all " \ "else because this changes most " \ - "operands' meaning") + "operands' meaning") \ + ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \ + ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \ + ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \ + ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \ + ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \ + ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \ + ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \ + ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \ + ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \ + ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\ + ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\ + ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") + #define ENUM_ENTRY(n, r, d) n, typedef enum { @@ -104,7 +123,9 @@ typedef enum { ONEBYTE = 0, TWOBYTE = 1, THREEBYTE_38 = 2, - THREEBYTE_3A = 3 + THREEBYTE_3A = 3, + THREEBYTE_A6 = 4, + THREEBYTE_A7 = 5 } OpcodeType; /* @@ -183,6 +204,7 @@ struct ContextDecision { ENUM_ENTRY(ENCODING_NONE, "") \ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ ENUM_ENTRY(ENCODING_CW, "2-byte") \ ENUM_ENTRY(ENCODING_CD, "4-byte") \ @@ -278,6 +300,7 @@ struct ContextDecision { ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \ ENUM_ENTRY(TYPE_XMM64, "8-byte") \ ENUM_ENTRY(TYPE_XMM128, "16-byte") \ + ENUM_ENTRY(TYPE_XMM256, "32-byte") \ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index d6950f4..dd6e353 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "asm-printer" #include "X86ATTInstPrinter.h" #include "X86InstComments.h" +#include "X86Subtarget.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" @@ -22,24 +23,38 @@ #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" #include "X86GenInstrNames.inc" +#include <map> using namespace llvm; // Include the auto-generated portion of the assembly writer. #define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "X86GenRegisterNames.inc" #include "X86GenAsmWriter.inc" +#undef PRINT_ALIAS_INSTR +#undef GET_INSTRUCTION_NAME + +X86ATTInstPrinter::X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) + : MCInstPrinter(MAI) { + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures( + &TM.getSubtarget<X86Subtarget>())); +} void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) { - printInstruction(MI, OS); + // Try to print any aliases first. + if (!printAliasInstr(MI, OS)) + printInstruction(MI, OS); // If verbose assembly is enabled, we can print some informative comments. 
if (CommentStream) EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } + StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const { return getInstructionName(Opcode); } - void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O) { switch (MI->getOperand(Op).getImm()) { diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index eb98664..8d69391 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -17,16 +17,24 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { - class MCOperand; + +class MCOperand; +class X86Subtarget; +class TargetMachine; class X86ATTInstPrinter : public MCInstPrinter { public: - X86ATTInstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) {} - + X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI); virtual void printInst(const MCInst *MI, raw_ostream &OS); virtual StringRef getOpcodeName(unsigned Opcode) const; + // Methods used to print the alias of an instruction. + unsigned ComputeAvailableFeatures(const X86Subtarget *Subtarget) const; + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &OS); static const char *getRegisterName(unsigned RegNo); diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 12144e3..c642acc 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -111,28 +111,28 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::PUNPCKLBWrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(16, ShuffleMask); + DecodePUNPCKLBWMask(16, ShuffleMask); break; case X86::PUNPCKLWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLWDrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(8, ShuffleMask); + DecodePUNPCKLWDMask(8, ShuffleMask); break; case X86::PUNPCKLDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(4, ShuffleMask); + DecodePUNPCKLDQMask(4, ShuffleMask); break; case X86::PUNPCKLQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLQDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(2, ShuffleMask); + DecodePUNPCKLQDQMask(2, ShuffleMask); break; case X86::SHUFPDrri: @@ -153,16 +153,44 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPDrm: - DecodeUNPCKLPMask(2, ShuffleMask); + DecodeUNPCKLPDMask(2, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKLPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPDrm: + DecodeUNPCKLPDMask(2, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::VUNPCKLPDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VUNPCKLPDYrm: + DecodeUNPCKLPDMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; case X86::UNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPSrm: - DecodeUNPCKLPMask(4, ShuffleMask); + DecodeUNPCKLPSMask(4, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSrm: + DecodeUNPCKLPSMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::VUNPCKLPSYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSYrm: + DecodeUNPCKLPSMask(8, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; case X86::UNPCKHPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 0484529..47253eb 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "asm-printer" #include "X86IntelInstPrinter.h" #include "X86InstComments.h" +#include "X86Subtarget.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 6f12032..ca99dc0 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -18,13 +18,15 @@ #include "llvm/Support/raw_ostream.h" namespace llvm { - class MCOperand; + +class MCOperand; +class TargetMachine; class X86IntelInstPrinter : public MCInstPrinter { public: - X86IntelInstPrinter(const MCAsmInfo &MAI) + X86IntelInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) : MCInstPrinter(MAI) {} - + virtual void printInst(const MCInst *MI, raw_ostream &OS); virtual StringRef getOpcodeName(unsigned Opcode) const; @@ -33,7 +35,6 @@ public: static const char *getRegisterName(unsigned RegNo); static const char *getInstructionName(unsigned Opcode); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O); diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt index e21d69a..e7429a3 100644 --- a/lib/Target/X86/README-X86-64.txt +++ b/lib/Target/X86/README-X86-64.txt @@ -36,7 +36,7 @@ _conv: cmovb %rcx, %rax ret -Seems like the jb branch has high likelyhood of being taken. It would have +Seems like the jb branch has high likelihood of being taken. It would have saved a few instructions. //===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index abd1515..ea3014e 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -7,14 +7,6 @@ copy (3-addr bswap + memory support?) This is available on Atom processors. //===---------------------------------------------------------------------===// -CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 -backend knows how to three-addressify this shift, but it appears the register -allocator isn't even asking it to do so in this case. 
We should investigate -why this isn't happening, it could have significant impact on other important -cases for X86 as well. - -//===---------------------------------------------------------------------===// - This should be one DIV/IDIV instruction, not a libcall: unsigned test(unsigned long long X, unsigned Y) { @@ -1572,7 +1564,7 @@ Implement processor-specific optimizations for parity with GCC on these processors. GCC does two optimizations: 1. ix86_pad_returns inserts a noop before ret instructions if immediately - preceeded by a conditional branch or is the target of a jump. + preceded by a conditional branch or is the target of a jump. 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of code contains more than 3 branches. @@ -1656,28 +1648,61 @@ information to add the "lock" prefix. //===---------------------------------------------------------------------===// -_Bool bar(int *x) { return *x & 1; } +struct B { + unsigned char y0 : 1; +}; -define zeroext i1 @bar(i32* nocapture %x) nounwind readonly { -entry: - %tmp1 = load i32* %x ; <i32> [#uses=1] - %and = and i32 %tmp1, 1 ; <i32> [#uses=1] - %tobool = icmp ne i32 %and, 0 ; <i1> [#uses=1] - ret i1 %tobool +int bar(struct B* a) { return a->y0; } + +define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize { + %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0 + %2 = load i8* %1, align 1 + %3 = and i8 %2, 1 + %4 = zext i8 %3 to i32 + ret i32 %4 } -bar: # @bar -# BB#0: # %entry - movl 4(%esp), %eax - movb (%eax), %al - andb $1, %al - movzbl %al, %eax - ret +bar: # @bar +# BB#0: + movb (%rdi), %al + andb $1, %al + movzbl %al, %eax + ret Missed optimization: should be movl+andl. //===---------------------------------------------------------------------===// +The x86_64 abi says: + +Booleans, when stored in a memory object, are stored as single byte objects the +value of which is always 0 (false) or 1 (true). + +We are not using this fact: + +int bar(_Bool *a) { return *a; } + +define i32 @bar(i8* nocapture %a) nounwind readonly optsize { + %1 = load i8* %a, align 1, !tbaa !0 + %tmp = and i8 %1, 1 + %2 = zext i8 %tmp to i32 + ret i32 %2 +} + +bar: + movb (%rdi), %al + andb $1, %al + movzbl %al, %eax + ret + +GCC produces + +bar: + movzbl (%rdi), %eax + ret + +//===---------------------------------------------------------------------===// + Consider the following two functions compiled with clang: _Bool foo(int *x) { return !(*x & 4); } unsigned bar(int *x) { return !(*x & 4); } @@ -1703,26 +1728,6 @@ are functionally identical. //===---------------------------------------------------------------------===// Take the following C code: -int x(int y) { return (y & 63) << 14; } - -Code produced by gcc: - andl $63, %edi - sall $14, %edi - movl %edi, %eax - ret - -Code produced by clang: - shll $14, %edi - movl %edi, %eax - andl $1032192, %eax - ret - -The code produced by gcc is 3 bytes shorter. This sort of construct often -shows up with bitfields. - -//===---------------------------------------------------------------------===// - -Take the following C code: int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } We generate the following IR with clang: @@ -1947,3 +1952,91 @@ which is "perfect". 
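The single-byte 0/1 guarantee in the x86_64 ABI note above is easy to check empirically. A throwaway test like the following (not part of this patch; assumes a psABI-conforming compiler) shows that storing any nonzero value into a _Bool leaves exactly 1 in the underlying byte, which is why a bare movzbl load would be sound:

#include <stdio.h>
#include <string.h>

int main(void) {
  _Bool b = 42;        /* conversion to _Bool normalizes any nonzero value to 1 */
  unsigned char raw;
  memcpy(&raw, &b, 1); /* inspect the byte actually stored in memory */
  printf("%u\n", raw); /* prints 1, never 42 */
  return 0;
}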
//===---------------------------------------------------------------------===// +For the branch in the following code: +int a(); +int b(int x, int y) { + if (x & (1<<(y&7))) + return a(); + return y; +} + +We currently generate: + movb %sil, %al + andb $7, %al + movzbl %al, %eax + btl %eax, %edi + jae .LBB0_2 + +movl+andl would be shorter than the movb+andb+movzbl sequence. + +//===---------------------------------------------------------------------===// + +For the following: +struct u1 { + float x, y; +}; +float foo(struct u1 u) { + return u.x + u.y; +} + +We currently generate: + movdqa %xmm0, %xmm1 + pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0] + addss %xmm1, %xmm0 + ret + +We could save an instruction here by commuting the addss. + +//===---------------------------------------------------------------------===// + +This (from PR9661): + +float clamp_float(float a) { + if (a > 1.0f) + return 1.0f; + else if (a < 0.0f) + return 0.0f; + else + return a; +} + +Could compile to: + +clamp_float: # @clamp_float + movss .LCPI0_0(%rip), %xmm1 + minss %xmm1, %xmm0 + pxor %xmm1, %xmm1 + maxss %xmm1, %xmm0 + ret + +with -ffast-math. + +//===---------------------------------------------------------------------===// + +This function (from PR9803): + +int clamp2(int a) { + if (a > 5) + a = 5; + if (a < 0) + return 0; + return a; +} + +Compiles to: + +_clamp2: ## @clamp2 + pushq %rbp + movq %rsp, %rbp + cmpl $5, %edi + movl $5, %ecx + cmovlel %edi, %ecx + testl %ecx, %ecx + movl $0, %eax + cmovnsl %ecx, %eax + popq %rbp + ret + +The move of 0 could be scheduled above the test to make it an xor reg,reg. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 1287977..cd06060 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -1,4 +1,4 @@ -//===-- X86ShuffleDecode.h - X86 shuffle decode logic ---------------------===// +//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// // // The LLVM Compiler Infrastructure // @@ -95,12 +95,29 @@ void DecodePSHUFLWMask(unsigned Imm, ShuffleMask.push_back(7); } -void DecodePUNPCKLMask(unsigned NElts, +void DecodePUNPCKLBWMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i8, NElts), ShuffleMask); +} + +void DecodePUNPCKLWDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i16, NElts), ShuffleMask); +} + +void DecodePUNPCKLDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); +} + +void DecodePUNPCKLQDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); +} + +void DecodePUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i); - ShuffleMask.push_back(i+NElts); - } + DecodeUNPCKLPMask(VT, ShuffleMask); } void DecodePUNPCKHMask(unsigned NElts, @@ -133,15 +150,40 @@ void DecodeUNPCKHPMask(unsigned NElts, } } +void DecodeUNPCKLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); +} + +void DecodeUNPCKLPDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), 
ShuffleMask); +} /// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// etc. NElts indicates the number of elements in the vector allowing it to -/// handle different datatypes and vector widths. -void DecodeUNPCKLPMask(unsigned NElts, +/// etc. VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i); // Reads from dest - ShuffleMask.push_back(i+NElts); // Reads from src + unsigned NumElts = VT.getVectorNumElements(); + + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElts / NumSections; + + unsigned Start = 0; + unsigned End = NumSectionElts / 2; + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = Start; i != End; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumSectionElts); // Reads from src/src2 + } + // Process the next 128 bits. + Start += NumSectionElts; + End += NumSectionElts; } } diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 50d9ccb..b18f670 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -16,6 +16,7 @@ #define X86_SHUFFLE_DECODE_H #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/ValueTypes.h" //===----------------------------------------------------------------------===// // Vector Mask Decoding @@ -45,7 +46,19 @@ void DecodePSHUFHWMask(unsigned Imm, void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); -void DecodePUNPCKLMask(unsigned NElts, +void DecodePUNPCKLBWMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLWDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLQDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); void DecodePUNPCKHMask(unsigned NElts, @@ -57,11 +70,16 @@ void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, void DecodeUNPCKHPMask(unsigned NElts, SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeUNPCKLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeUNPCKLPDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); /// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// etc. NElts indicates the number of elements in the vector allowing it to -/// handle different datatypes and vector widths. -void DecodeUNPCKLPMask(unsigned NElts, +/// etc. VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); } // llvm namespace diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index efb6c8c..25b8d3e 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -1,13 +1,13 @@ //===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. 
See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // -// This is a target description file for the Intel i386 architecture, refered to +// This is a target description file for the Intel i386 architecture, referred to // here as the "X86" architecture. // //===----------------------------------------------------------------------===// @@ -32,7 +32,7 @@ def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions", // SSE codegen depends on cmovs, and all - // SSE1+ processors support them. + // SSE1+ processors support them. [FeatureMMX, FeatureCMOV]>; def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", "Enable SSE2 instructions", @@ -50,7 +50,8 @@ def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", [FeatureSSE41, FeaturePOPCNT]>; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", - "Enable 3DNow! instructions">; + "Enable 3DNow! instructions", + [FeatureMMX]>; def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", "Enable 3DNow! Athlon instructions", [Feature3DNow]>; @@ -125,10 +126,10 @@ def : Proc<"sandybridge", [FeatureSSE42, Feature64Bit, FeatureAES, FeatureCLMUL]>; def : Proc<"k6", [FeatureMMX]>; -def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>; -def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>; -def : Proc<"athlon", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>; -def : Proc<"athlon-tbird", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"k6-2", [Feature3DNow]>; +def : Proc<"k6-3", [Feature3DNow]>; +def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; @@ -156,8 +157,8 @@ def : Proc<"shanghai", [Feature3DNowA, Feature64Bit, FeatureSSE4A, Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureMMX]>; -def : Proc<"winchip2", [FeatureMMX, Feature3DNow]>; -def : Proc<"c3", [FeatureMMX, Feature3DNow]>; +def : Proc<"winchip2", [Feature3DNow]>; +def : Proc<"c3", [Feature3DNow]>; def : Proc<"c3-2", [FeatureSSE1]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index da5f5b1..4d7d96d 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Object/MachOFormat.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -28,6 +29,13 @@ #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; +// Option to allow disabling arithmetic relaxation to workaround PR9807, which +// is useful when running bitwise comparison experiments on Darwin. We should be +// able to remove this once PR9807 is resolved. 
+static cl::opt<bool> +MCDisableArithRelaxation("mc-x86-disable-arith-relaxation", + cl::desc("Disable relaxation of arithmetic instruction for X86")); + static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { default: assert(0 && "invalid fixup kind!"); @@ -201,6 +209,9 @@ bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const { if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode()) return true; + if (MCDisableArithRelaxation) + return false; + // Check if this instruction is ever relaxable. if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode()) return false; @@ -307,10 +318,13 @@ public: : ELFX86AsmBackend(T, OSType) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createELFObjectWriter(new X86ELFObjectWriter(false, OSType, - ELF::EM_386, false), + return createELFObjectWriter(createELFObjectTargetWriter(), OS, /*IsLittleEndian*/ true); } + + MCELFObjectTargetWriter *createELFObjectTargetWriter() const { + return new X86ELFObjectWriter(false, OSType, ELF::EM_386, false); + } }; class ELFX86_64AsmBackend : public ELFX86AsmBackend { @@ -319,10 +333,13 @@ public: : ELFX86AsmBackend(T, OSType) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createELFObjectWriter(new X86ELFObjectWriter(true, OSType, - ELF::EM_X86_64, true), + return createELFObjectWriter(createELFObjectTargetWriter(), OS, /*IsLittleEndian*/ true); } + + MCELFObjectTargetWriter *createELFObjectTargetWriter() const { + return new X86ELFObjectWriter(true, OSType, ELF::EM_X86_64, true); + } }; class WindowsX86AsmBackend : public X86AsmBackend { @@ -408,34 +425,26 @@ public: TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return new DarwinX86_32AsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (Triple(TT).getEnvironment() == Triple::MachO) - return new DarwinX86_32AsmBackend(T); - else - return new WindowsX86AsmBackend(T, false); - default: - return new ELFX86_32AsmBackend(T, Triple(TT).getOS()); - } + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, false); + + return new ELFX86_32AsmBackend(T, TheTriple.getOS()); } TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return new DarwinX86_64AsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (Triple(TT).getEnvironment() == Triple::MachO) - return new DarwinX86_64AsmBackend(T); - else - return new WindowsX86AsmBackend(T, true); - default: - return new ELFX86_64AsmBackend(T, Triple(TT).getOS()); - } + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, true); + + return new ELFX86_64AsmBackend(T, TheTriple.getOS()); } diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 99b4479..c2d53c4 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -709,12 +709,13 @@ void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, //===----------------------------------------------------------------------===// static MCInstPrinter *createX86MCInstPrinter(const Target &T, + TargetMachine &TM, unsigned SyntaxVariant, const MCAsmInfo &MAI) { 
if (SyntaxVariant == 0) - return new X86ATTInstPrinter(MAI); + return new X86ATTInstPrinter(TM, MAI); if (SyntaxVariant == 1) - return new X86IntelInstPrinter(MAI); + return new X86IntelInstPrinter(TM, MAI); return 0; } diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index a44fb69..5635175 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -215,6 +215,13 @@ def CC_X86_Win64_C : CallingConv<[ // The first 4 integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], [XMM0, XMM1, XMM2, XMM3]>>, + + // Do not pass the sret argument in RCX, the Win64 thiscall calling + // convention requires "this" to be passed in RCX. + CCIfCC<"CallingConv::X86_ThisCall", + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ], + [XMM1, XMM2, XMM3]>>>>, + CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], [XMM0, XMM1, XMM2, XMM3]>>, diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index 60d9d4a..421e221 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -652,6 +652,8 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::TB: // Two-byte opcode prefix case X86II::T8: // 0F 38 case X86II::TA: // 0F 3A + case X86II::A6: // 0F A6 + case X86II::A7: // 0F A7 Need0FPrefix = true; break; case X86II::TF: // F2 0F 38 @@ -695,6 +697,12 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::TA: // 0F 3A MCE.emitByte(0x3A); break; + case X86II::A6: // 0F A6 + MCE.emitByte(0xA6); + break; + case X86II::A7: // 0F A7 + MCE.emitByte(0xA7); + break; } // If this is a two-address instruction, skip one of the register operands. diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 6fa9284..1382f18 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -23,6 +23,7 @@ #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Operator.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -77,10 +78,8 @@ private: bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); - bool X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM); - bool X86FastEmitStore(EVT VT, unsigned Val, - const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -125,6 +124,8 @@ private: unsigned TargetMaterializeAlloca(const AllocaInst *C); + unsigned TargetMaterializeFloatZero(const ConstantFP *CF); + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { @@ -133,6 +134,9 @@ private: } bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false); + + bool TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len); }; } // end anonymous namespace. @@ -224,8 +228,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. 
bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, - const X86AddressMode &AM) { +X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -395,37 +398,45 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { const Value *Op = *i; if (const StructType *STy = dyn_cast<StructType>(*GTI)) { const StructLayout *SL = TD.getStructLayout(STy); - unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); - Disp += SL->getElementOffset(Idx); - } else { - uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); - SmallVector<const Value *, 4> Worklist; - Worklist.push_back(Op); - do { - Op = Worklist.pop_back_val(); - if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { - // Constant-offset addressing. - Disp += CI->getSExtValue() * S; - } else if (isa<AddOperator>(Op) && - isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { - // An add with a constant operand. Fold the constant. - ConstantInt *CI = - cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); - Disp += CI->getSExtValue() * S; - // Add the other operand back to the work list. - Worklist.push_back(cast<AddOperator>(Op)->getOperand(0)); - } else if (IndexReg == 0 && - (!AM.GV || !Subtarget->isPICStyleRIPRel()) && - (S == 1 || S == 2 || S == 4 || S == 8)) { - // Scaled-index addressing. - Scale = S; - IndexReg = getRegForGEPIndex(Op).first; - if (IndexReg == 0) - return false; - } else - // Unsupported. - goto unsupported_gep; - } while (!Worklist.empty()); + Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); + continue; + } + + // An array/variable index is always of the form i*S where S is the + // constant scale size. See if we can push the scale into immediates. + uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + Disp += CI->getSExtValue() * S; + break; + } + if (isa<AddOperator>(Op) && + (!isa<Instruction>(Op) || + FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] + == FuncInfo.MBB) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add (in the same block) with a constant operand. Fold the + // constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + Disp += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + if (IndexReg == 0 && + (!AM.GV || !Subtarget->isPICStyleRIPRel()) && + (S == 1 || S == 2 || S == 4 || S == 8)) { + // Scaled-index addressing. + Scale = S; + IndexReg = getRegForGEPIndex(Op).first; + if (IndexReg == 0) + return false; + break; + } + // Unsupported. + goto unsupported_gep; } } // Check for displacement overflow. @@ -439,7 +450,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { if (X86SelectAddress(U->getOperand(0), AM)) return true; - // If we couldn't merge the sub value into this addr mode, revert back to + // If we couldn't merge the gep value into this addr mode, revert back to // our address and just match the value instead of completely failing. AM = SavedAM; break; @@ -451,91 +462,91 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // Handle constant address. if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { - // Can't handle alternate code models yet. + // Can't handle alternate code models or TLS yet. 
if (TM.getCodeModel() != CodeModel::Small) return false; - // RIP-relative addresses can't have additional register operands. - if (Subtarget->isPICStyleRIPRel() && - (AM.Base.Reg != 0 || AM.IndexReg != 0)) - return false; - - // Can't handle TLS yet. if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) if (GVar->isThreadLocal()) return false; + + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } - // Okay, we've committed to selecting this global. Set up the basic address. - AM.GV = GV; - - // Allow the subtarget to classify the global. - unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); - - // If this reference is relative to the pic base, set it now. - if (isGlobalRelativeToPICBase(GVFlags)) { - // FIXME: How do we know Base.Reg is free?? - AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); - } - - // Unless the ABI requires an extra load, return a direct reference to - // the global. - if (!isGlobalStubReference(GVFlags)) { - if (Subtarget->isPICStyleRIPRel()) { - // Use rip-relative addressing if we can. Above we verified that the - // base and index registers are unused. - assert(AM.Base.Reg == 0 && AM.IndexReg == 0); - AM.Base.Reg = X86::RIP; + // Unless the ABI requires an extra load, return a direct reference to + // the global. + if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; } - AM.GVOpFlags = GVFlags; - return true; - } - // Ok, we need to do a load from a stub. If we've already loaded from this - // stub, reuse the loaded pointer, otherwise emit the load now. - DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); - unsigned LoadReg; - if (I != LocalValueMap.end() && I->second != 0) { - LoadReg = I->second; - } else { - // Issue load from stub. - unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; - X86AddressMode StubAM; - StubAM.Base.Reg = AM.Base.Reg; - StubAM.GV = GV; - StubAM.GVOpFlags = GVFlags; - - // Prepare for inserting code in the local-value area. - SavePoint SaveInsertPt = enterLocalValueArea(); - - if (TLI.getPointerTy() == MVT::i64) { - Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; - - if (Subtarget->isPICStyleRIPRel()) - StubAM.Base.Reg = X86::RIP; + // Ok, we need to do a load from a stub. If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; } else { - Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; - } + // Issue load from stub. 
+ unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. + SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy() == MVT::i64) { + Opc = X86::MOV64rm; + RC = X86::GR64RegisterClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = X86::GR32RegisterClass; + } - LoadReg = createResultReg(RC); - MachineInstrBuilder LoadMI = - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); - addFullAddress(LoadMI, StubAM); + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); - // Ok, back to normal mode. - leaveLocalValueArea(SaveInsertPt); + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); - // Prevent loading GV stub multiple times in same MBB. - LocalValueMap[V] = LoadReg; - } + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } - // Now construct the final address. Note that the Disp, Scale, - // and Index values may already be set here. - AM.Base.Reg = LoadReg; - AM.GV = 0; - return true; + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = 0; + return true; + } } // If all else fails, try to materialize the value in a register. @@ -856,12 +867,9 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { unsigned NEReg = createResultReg(&X86::GR8RegClass); unsigned PReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::SETNEr), NEReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::SETPr), PReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::OR8rr), ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg) .addReg(PReg).addReg(NEReg); UpdateValueMap(I, ResultReg); return true; @@ -1059,14 +1067,49 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } } } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which + // typically happen for _Bool and C++ bools. + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { + unsigned TestOpc = 0; + switch (SourceVT.SimpleTy) { + default: break; + case MVT::i8: TestOpc = X86::TEST8ri; break; + case MVT::i16: TestOpc = X86::TEST16ri; break; + case MVT::i32: TestOpc = X86::TEST32ri; break; + case MVT::i64: TestOpc = X86::TEST64ri32; break; + } + if (TestOpc) { + unsigned OpReg = getRegForValue(TI->getOperand(0)); + if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc)) + .addReg(OpReg).addImm(1); + + unsigned JmpOpc = X86::JNE_4; + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + JmpOpc = X86::JE_4; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; + } + } } // Otherwise do a clumsy setcc and re-test it. 
+ // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used + // in an explicit cast, so make sure to handle that correctly. unsigned OpReg = getRegForValue(BI->getCondition()); if (OpReg == 0) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr)) - .addReg(OpReg).addReg(OpReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri)) + .addReg(OpReg).addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DL); @@ -1075,42 +1118,42 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } bool X86FastISel::X86SelectShift(const Instruction *I) { - unsigned CReg = 0, OpReg = 0, OpImm = 0; + unsigned CReg = 0, OpReg = 0; const TargetRegisterClass *RC = NULL; if (I->getType()->isIntegerTy(8)) { CReg = X86::CL; RC = &X86::GR8RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR8rCL; OpImm = X86::SHR8ri; break; - case Instruction::AShr: OpReg = X86::SAR8rCL; OpImm = X86::SAR8ri; break; - case Instruction::Shl: OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break; + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(16)) { CReg = X86::CX; RC = &X86::GR16RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR16rCL; OpImm = X86::SHR16ri; break; - case Instruction::AShr: OpReg = X86::SAR16rCL; OpImm = X86::SAR16ri; break; - case Instruction::Shl: OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break; + case Instruction::LShr: OpReg = X86::SHR16rCL; break; + case Instruction::AShr: OpReg = X86::SAR16rCL; break; + case Instruction::Shl: OpReg = X86::SHL16rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(32)) { CReg = X86::ECX; RC = &X86::GR32RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR32rCL; OpImm = X86::SHR32ri; break; - case Instruction::AShr: OpReg = X86::SAR32rCL; OpImm = X86::SAR32ri; break; - case Instruction::Shl: OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break; + case Instruction::LShr: OpReg = X86::SHR32rCL; break; + case Instruction::AShr: OpReg = X86::SAR32rCL; break; + case Instruction::Shl: OpReg = X86::SHL32rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(64)) { CReg = X86::RCX; RC = &X86::GR64RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR64rCL; OpImm = X86::SHR64ri; break; - case Instruction::AShr: OpReg = X86::SAR64rCL; OpImm = X86::SAR64ri; break; - case Instruction::Shl: OpReg = X86::SHL64rCL; OpImm = X86::SHL64ri; break; + case Instruction::LShr: OpReg = X86::SHR64rCL; break; + case Instruction::AShr: OpReg = X86::SAR64rCL; break; + case Instruction::Shl: OpReg = X86::SHL64rCL; break; default: return false; } } else { @@ -1124,15 +1167,6 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; - // Fold immediate in shl(x,3). 
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpImm), - ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff); - UpdateValueMap(I, ResultReg); - return true; - } - unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), @@ -1294,10 +1328,61 @@ bool X86FastISel::X86SelectExtractValue(const Instruction *I) { return false; } +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len) { + // Make sure we don't bloat code by inlining very large memcpy's. + bool i64Legal = TLI.isTypeLegal(MVT::i64); + if (Len > (i64Legal ? 32 : 16)) return false; + + // We don't care about alignment here since we just emit integer accesses. + while (Len) { + MVT VT; + if (Len >= 8 && i64Legal) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + assert(Len == 1); + VT = MVT::i8; + } + + unsigned Reg; + bool RV = X86FastEmitLoad(VT, SrcAM, Reg); + RV &= X86FastEmitStore(VT, Reg, DestAM); + assert(RV && "Failed to emit load or store??"); + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + DestAM.Disp += Size; + SrcAM.Disp += Size; + } + + return true; +} + bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::memcpy: { + const MemCpyInst &MCI = cast<MemCpyInst>(I); + // Don't handle volatile or variable length memcpys. + if (MCI.isVolatile() || !isa<ConstantInt>(MCI.getLength())) + return false; + + uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue(); + + // Get the address of the dest and source addresses. + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI.getRawDest(), DestAM) || + !X86SelectAddress(MCI.getRawSource(), SrcAM)) + return false; + + return TryEmitSmallMemcpy(DestAM, SrcAM, Len); + } + case Intrinsic::stackprotector: { // Emit code inline code to store the stack guard onto the stack. EVT PtrTy = TLI.getPointerTy(); @@ -1308,17 +1393,14 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // Grab the frame index. X86AddressMode AM; if (!X86SelectAddress(Slot, AM)) return false; - if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; - return true; } case Intrinsic::objectsize: { - ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1)); + // FIXME: This should be moved to generic code! + ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1)); const Type *Ty = I.getCalledFunction()->getReturnType(); - assert(CI && "Non-constant type in Intrinsic::objectsize?"); - MVT VT; if (!isTypeLegal(Ty, VT)) return false; @@ -1356,6 +1438,8 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { } case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: { + // FIXME: Should fold immediates. + // Replace "add with overflow" intrinsics with an "add" instruction followed // by a seto/setc instruction. Later on, when the "extractvalue" // instructions are encountered, we use the fact that two registers were @@ -1427,8 +1511,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { // Handle only C and fastcc calling conventions for now. 
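// --- Sketch, not part of the patch: a standalone model (assumed names) of
// --- the widest-first chunking TryEmitSmallMemcpy above performs — copy
// --- 8/4/2/1-byte pieces until Len is zero, bumping both displacements.
#include <cassert>
#include <cstdint>
#include <cstring>

static void smallMemcpy(uint8_t *Dst, const uint8_t *Src, uint64_t Len,
                        bool i64Legal) {
  while (Len) {
    unsigned Size;
    if (Len >= 8 && i64Legal) Size = 8;   // one i64 load/store pair
    else if (Len >= 4)        Size = 4;   // i32
    else if (Len >= 2)        Size = 2;   // i16
    else                      Size = 1;   // i8
    std::memcpy(Dst, Src, Size);          // stands in for FastEmitLoad/Store
    Len -= Size;
    Dst += Size;                          // advance Disp on both sides
    Src += Size;
  }
}

int main() {
  uint8_t Src[13], Dst[13] = {0};
  for (int i = 0; i != 13; ++i) Src[i] = uint8_t(i * 7);
  smallMemcpy(Dst, Src, 13, /*i64Legal=*/true);   // 8 + 4 + 1 bytes
  assert(std::memcmp(Dst, Src, 13) == 0);
}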
ImmutableCallSite CS(CI); CallingConv::ID CC = CS.getCallingConv(); - if (CC != CallingConv::C && - CC != CallingConv::Fast && + if (CC != CallingConv::C && CC != CallingConv::Fast && CC != CallingConv::X86_FastCall) return false; @@ -1437,14 +1520,17 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (CC == CallingConv::Fast && GuaranteedTailCallOpt) return false; - // Let SDISel handle vararg functions. const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); const FunctionType *FTy = cast<FunctionType>(PT->getElementType()); - if (FTy->isVarArg()) + bool isVarArg = FTy->isVarArg(); + + // Don't know how to handle Win64 varargs yet. Nothing special needed for + // x86-32. Special handling for x86-64 is implemented. + if (isVarArg && Subtarget->isTargetWin64()) return false; // Fast-isel doesn't know about callee-pop yet. - if (Subtarget->IsCalleePop(FTy->isVarArg(), CC)) + if (Subtarget->IsCalleePop(isVarArg, CC)) return false; // Handle *simple* calls for now. @@ -1487,9 +1573,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { ArgFlags.reserve(CS.arg_size()); for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { - unsigned Arg = getRegForValue(*i); - if (Arg == 0) - return false; + Value *ArgVal = *i; ISD::ArgFlagsTy Flags; unsigned AttrInd = i - CS.arg_begin() + 1; if (CS.paramHasAttr(AttrInd, Attribute::SExt)) @@ -1497,34 +1581,67 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) Flags.setZExt(); + // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all fastisel supported + // calling conventions on x86. + if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) { + if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 || + CI->getBitWidth() == 16) { + if (Flags.isSExt()) + ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext())); + else + ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext())); + } + } + + unsigned ArgReg; + + // Passing bools around ends up doing a trunc to i1 and passing it. + // Codegen this as an argument + "and 1". + if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) && + cast<TruncInst>(ArgVal)->getParent() == I->getParent() && + ArgVal->hasOneUse()) { + ArgVal = cast<TruncInst>(ArgVal)->getOperand(0); + ArgReg = getRegForValue(ArgVal); + if (ArgReg == 0) return false; + + MVT ArgVT; + if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false; + + ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg, + ArgVal->hasOneUse(), 1); + } else { + ArgReg = getRegForValue(ArgVal); + } + + if (ArgReg == 0) return false; + // FIXME: Only handle *easy* calls for now. if (CS.paramHasAttr(AttrInd, Attribute::InReg) || - CS.paramHasAttr(AttrInd, Attribute::StructRet) || CS.paramHasAttr(AttrInd, Attribute::Nest) || CS.paramHasAttr(AttrInd, Attribute::ByVal)) return false; - const Type *ArgTy = (*i)->getType(); + const Type *ArgTy = ArgVal->getType(); MVT ArgVT; if (!isTypeLegal(ArgTy, ArgVT)) return false; unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); Flags.setOrigAlign(OriginalAlignment); - Args.push_back(Arg); - ArgVals.push_back(*i); + Args.push_back(ArgReg); + ArgVals.push_back(ArgVal); ArgVTs.push_back(ArgVT); ArgFlags.push_back(Flags); } // Analyze operands of the call, assigning locations to each operand. 
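// --- Sketch, not part of the patch: the i1/i8/i16 constant-argument
// --- promotion above, modeled in plain C++. Whether the small immediate is
// --- sign- or zero-extended to 32 bits follows the parameter's sext/zext
// --- attribute, so no extend instruction is needed at the call site.
#include <cassert>
#include <cstdint>

static int32_t promoteConstArg(int8_t C, bool IsSExt) {
  return IsSExt ? int32_t(C)                       // ConstantExpr::getSExt
                : int32_t(uint32_t(uint8_t(C)));   // ConstantExpr::getZExt
}

int main() {
  assert(promoteConstArg(int8_t(-1), true) == -1);    // sext 0xFF -> -1
  assert(promoteConstArg(int8_t(-1), false) == 255);  // zext 0xFF -> 255
}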
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext()); + CCState CCInfo(CC, isVarArg, TM, ArgLocs, I->getParent()->getContext()); // Allocate shadow area for Win64 - if (Subtarget->isTargetWin64()) { + if (Subtarget->isTargetWin64()) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); @@ -1618,6 +1735,17 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { X86::EBX).addReg(Base); } + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) { + // Count the number of XMM registers allocated. + static const unsigned XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri), + X86::AL).addImm(NumXMMRegs); + } + // Issue the call. MachineInstrBuilder MIB; if (CalleeOp) { @@ -1656,7 +1784,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -1672,14 +1801,20 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX); + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) + MIB.addReg(X86::AL); + // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); // Issue CALLSEQ_END unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode(); + unsigned NumBytesCallee = 0; + if (!Subtarget->is64Bit() && CS.paramHasAttr(1, Attribute::StructRet)) + NumBytesCallee = 4; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) - .addImm(NumBytes).addImm(0); + .addImm(NumBytes).addImm(NumBytesCallee); // Now handle call return value (if any). SmallVector<unsigned, 4> UsedRegs; @@ -1850,10 +1985,13 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { if (isa<GlobalValue>(C)) { X86AddressMode AM; if (X86SelectAddress(C, AM)) { - if (TLI.getPointerTy() == MVT::i32) - Opc = X86::LEA32r; - else - Opc = X86::LEA64r; + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. + if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0) + return AM.Base.Reg; + + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg), AM); @@ -1915,6 +2053,45 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { return ResultReg; } +unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { + MVT VT; + if (!isTypeLegal(CF->getType(), VT)) + return false; + + // Get opcode and regclass for the given zero. 
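// --- Sketch, not part of the patch: the SysV x86-64 ABI expects %al to hold
// --- an upper bound on the number of vector registers a varargs call uses;
// --- the code above derives it as the first unallocated XMM argument
// --- register. Plain C++ model of that count:
#include <cassert>

static unsigned firstUnallocated(const bool Used[], unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    if (!Used[i])
      return i;                 // first XMM arg register not yet assigned
  return N;
}

int main() {
  bool Used[8] = {true, true, true, false, false, false, false, false};
  assert(firstUnallocated(Used, 8) == 3);   // MOV8ri %al, 3 before the call
}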
+ unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: + if (Subtarget->hasSSE1()) { + Opc = X86::FsFLD0SS; + RC = X86::FR32RegisterClass; + } else { + Opc = X86::LD_Fp032; + RC = X86::RFP32RegisterClass; + } + break; + case MVT::f64: + if (Subtarget->hasSSE2()) { + Opc = X86::FsFLD0SD; + RC = X86::FR64RegisterClass; + } else { + Opc = X86::LD_Fp064; + RC = X86::RFP64RegisterClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + } + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); + return ResultReg; +} + + /// TryToFoldLoad - The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, /// try to fold the load as an operand to the instruction, returning true if diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 3aaa693..325d061 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -1307,7 +1307,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // set up by FpSET_ST0, and our StackTop is off by one because of it. unsigned Op0 = getFPReg(MI->getOperand(0)); // Restore the actual StackTop from before Fp_SET_ST0. - // Note we can't handle Fp_SET_ST1 without a preceeding Fp_SET_ST0, and we + // Note we can't handle Fp_SET_ST1 without a preceding Fp_SET_ST0, and we // are not enforcing the constraint. ++StackTop; unsigned RegOnTop = getStackEntry(0); // This reg must remain in st(0). diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 0a3f931..06d12fc 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" @@ -296,7 +297,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, // FIXME: This is dirty hack. The code itself is pretty mess right now. // It should be rewritten from scratch and generalized sometimes. - // Determine maximum offset (minumum due to stack growth). + // Determine maximum offset (minimum due to stack growth). int64_t MaxOffset = 0; for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) @@ -551,65 +552,71 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. - if (NumBytes >= 4096 && - (STI.isTargetCygMing() || STI.isTargetWin32()) && - !STI.isTargetEnvMacho()) { + if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) { + const char *StackProbeSymbol; + bool isSPUpdateNeeded = false; + + if (Is64Bit) { + if (STI.isTargetCygMing()) + StackProbeSymbol = "___chkstk"; + else { + StackProbeSymbol = "__chkstk"; + isSPUpdateNeeded = true; + } + } else if (STI.isTargetCygMing()) + StackProbeSymbol = "_alloca"; + else + StackProbeSymbol = "_chkstk"; + // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); - const char *StackProbeSymbol = - STI.isTargetWindows() ? 
"_chkstk" : "_alloca"; - if (Is64Bit && STI.isTargetCygMing()) - StackProbeSymbol = "__chkstk"; - unsigned CallOp = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; - if (!isEAXAlive) { - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(NumBytes); - BuildMI(MBB, MBBI, DL, TII.get(CallOp)) - .addExternalSymbol(StackProbeSymbol) - .addReg(StackPtr, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - } else { + if (isEAXAlive) { + // Sanity check that EAX is not livein for this function. + // It should not be, so throw an assert. + assert(!Is64Bit && "EAX is livein in x64 case!"); + // Save EAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) .addReg(X86::EAX, RegState::Kill); + } - // Allocate NumBytes-4 bytes on stack. We'll also use 4 already - // allocated bytes for EAX. + if (Is64Bit) { + // Handle the 64-bit Windows ABI case where we need to call __chkstk. + // Function prologue is responsible for adjusting the stack pointer. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes); + } else { + // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. + // We'll also use 4 already allocated bytes for EAX. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(NumBytes - 4); - BuildMI(MBB, MBBI, DL, TII.get(CallOp)) - .addExternalSymbol(StackProbeSymbol) - .addReg(StackPtr, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - - // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); - MBB.insert(MBBI, MI); + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes); + } + + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32)) + .addExternalSymbol(StackProbeSymbol) + .addReg(StackPtr, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + + // MSVC x64's __chkstk needs to adjust %rsp. + // FIXME: %rax preserves the offset and should be available. + if (isSPUpdateNeeded) + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, + TII, *RegInfo); + + if (isEAXAlive) { + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), + X86::EAX), + StackPtr, false, NumBytes - 4); + MBB.insert(MBBI, MI); } - } else if (NumBytes >= 4096 && - STI.isTargetWin64() && - !STI.isTargetEnvMacho()) { - // Sanity check that EAX is not livein for this function. It should - // not be, so throw an assert. - assert(!isEAXLiveIn(MF) && "EAX is livein in the Win64 case!"); - - // Handle the 64-bit Windows ABI case where we need to call __chkstk. - // Function prologue is responsible for adjusting the stack pointer. - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(NumBytes); - BuildMI(MBB, MBBI, DL, TII.get(X86::WINCALL64pcrel32)) - .addExternalSymbol("__chkstk") - .addReg(StackPtr, RegState::Define | RegState::Implicit); - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, - TII, *RegInfo); } else if (NumBytes) emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII, *RegInfo); - if ((NumBytes || PushedRegs) && needsFrameMoves) { + if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { // Mark end of stack pointer adjustment. 
MCSymbol *Label = MMI.getContext().CreateTempSymbol(); BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); @@ -779,7 +786,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, assert(Offset >= 0 && "Offset should never be negative"); if (Offset) { - // Check for possible merge with preceeding ADD instruction. + // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII, *RegInfo); } @@ -823,7 +830,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, int delta = -1*X86FI->getTCReturnAddrDelta(); MBBI = MBB.getLastNonDebugInstr(); - // Check for possible merge with preceeding ADD instruction. + // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII, *RegInfo); } @@ -892,7 +899,6 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); - bool isWin64 = STI.isTargetWin64(); unsigned SlotSize = STI.is64Bit() ? 8 : 4; unsigned FPReg = TRI->getFrameRegister(MF); unsigned CalleeFrameSize = 0; @@ -900,25 +906,39 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); if (Reg == FPReg) // X86RegisterInfo::emitPrologue will handle spilling of frame register. continue; - if (!X86::VR128RegClass.contains(Reg) && !isWin64) { - CalleeFrameSize += SlotSize; - BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill); - } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), - RC, TRI); - } + CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill); } X86FI->setCalleeSavedFrameSize(CalleeFrameSize); + + // Make XMM regs spilled. X86 does not have ability of push/pop XMM. + // It can be done by spilling XMMs to stack frame. + // Note that only Win64 ABI might spill XMMs. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), + RC, TRI); + } + return true; } @@ -933,21 +953,30 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + // Reload XMMs from stack frame. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), + RC, TRI); + } + + // POP GPRs. 
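// --- Sketch (hypothetical standalone model), not part of the patch: the
// --- two-pass spill order used above. GPRs are PUSHed first and grow
// --- CalleeFrameSize; XMMs cannot be pushed on x86, so they are stored to
// --- frame slots in a second pass.
#include <cassert>
#include <string>
#include <vector>

struct CSR { std::string Name; bool IsGPR; };

static std::vector<std::string> spillOrder(const std::vector<CSR> &CSI) {
  std::vector<std::string> Out;
  for (auto I = CSI.rbegin(); I != CSI.rend(); ++I)   // pass 1: push GPRs
    if (I->IsGPR) Out.push_back("push " + I->Name);
  for (auto I = CSI.rbegin(); I != CSI.rend(); ++I)   // pass 2: spill XMMs
    if (!I->IsGPR) Out.push_back("movaps [slot], " + I->Name);
  return Out;
}

int main() {
  auto Order = spillOrder({{"rbx", true}, {"xmm6", false}, {"r12", true}});
  assert(Order.size() == 3 && Order[0] == "push r12" &&
         Order[2] == "movaps [slot], xmm6");
}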
unsigned FPReg = TRI->getFrameRegister(MF); - bool isWin64 = STI.isTargetWin64(); unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; if (Reg == FPReg) // X86RegisterInfo::emitEpilogue will handle restoring of frame register. continue; - if (!X86::VR128RegClass.contains(Reg) && !isWin64) { - BuildMI(MBB, MI, DL, TII.get(Opc), Reg); - } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), - RC, TRI); - } + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); } return true; } diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 9b0ec6e..4534e85 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1580,6 +1580,81 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return RetVal; break; } + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + break; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + break; + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!Cst || !ShlCst) + break; + + int64_t Val = Cst->getSExtValue(); + uint64_t ShlVal = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val) + break; + + unsigned ShlOp, Op = 0; + EVT CstVT = NVT; + + // Check the minimum bitwidth for the new constant. + // TODO: AND32ri is the same as AND64ri32 with zext imm. + // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. + if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) + CstVT = MVT::i8; + else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) + CstVT = MVT::i32; + + // Bail if there is no smaller encoding. + if (NVT == CstVT) + break; + + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i32: + assert(CstVT == MVT::i8); + ShlOp = X86::SHL32ri; + + switch (Opcode) { + case ISD::AND: Op = X86::AND32ri8; break; + case ISD::OR: Op = X86::OR32ri8; break; + case ISD::XOR: Op = X86::XOR32ri8; break; + } + break; + case MVT::i64: + assert(CstVT == MVT::i8 || CstVT == MVT::i32); + ShlOp = X86::SHL64ri; + + switch (Opcode) { + case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; + case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; + case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + } + break; + } + + // Emit the smaller op and the shift. 
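// --- Sketch, not part of the patch: the algebraic identity behind the
// --- immediate-shrinking transform above,
// ---   (x << C1) op C2 == (x op (C2 >> C1)) << C1,
// --- which holds unconditionally for AND, and for OR/XOR exactly when C2
// --- has no bits set below C1 (the ((Val >> ShlVal) << ShlVal) == Val guard).
#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x12345678, C2 = 0xF00, C1 = 8;
  assert(((C2 >> C1) << C1) == C2);                      // guard holds
  assert(((x << C1) | C2) == ((x | (C2 >> C1)) << C1));  // OR
  assert(((x << C1) ^ C2) == ((x ^ (C2 >> C1)) << C1));  // XOR
  assert(((x << C1) & C2) == ((x & (C2 >> C1)) << C1));  // AND
}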
+ SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT); + SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); + return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), + getI8Imm(ShlVal)); + break; + } case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2f49dbc..703c01d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -45,6 +45,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/VectorExtras.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" @@ -221,7 +222,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // X86 is weird, it always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); - setSchedulingPreference(Sched::RegPressure); + + // For 64-bit since we have so many registers use the ILP scheduler, for + // 32-bit code use the register pressure specific scheduling. + if (Subtarget->is64Bit()) + setSchedulingPreference(Sched::ILP); + else + setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { @@ -543,12 +550,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (Subtarget->is64Bit()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - if (Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); - else - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, + (Subtarget->is64Bit() ? MVT::i64 : MVT::i32), + (Subtarget->isTargetCOFF() + && !Subtarget->isTargetEnvMacho() + ? Custom : Expand)); if (!UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. @@ -921,6 +927,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Can turn SHL into an integer multiply. setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v16i8, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Legal); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. f32 vectors are @@ -1271,27 +1278,6 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{ return std::make_pair(RRC, Cost); } -// FIXME: Why this routine is here? Move to RegInfo! -unsigned -X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; - switch (RC->getID()) { - default: - return 0; - case X86::GR32RegClassID: - return 4 - FPDiff; - case X86::GR64RegClassID: - return 8 - FPDiff; - case X86::VR128RegClassID: - return Subtarget->is64Bit() ? 
10 : 4; - case X86::VR64RegClassID: - return 4; - } -} - bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const { if (!Subtarget->isTargetLinux()) @@ -1463,6 +1449,20 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { return HasRet; } +EVT +X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const { + MVT ReturnMVT; + // TODO: Is this also valid on 32-bit? + if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) + ReturnMVT = MVT::i8; + else + ReturnMVT = MVT::i32; + + EVT MinVT = getRegisterType(Context, ReturnMVT); + return VT.bitsLT(MinVT) ? MinVT : VT; +} + /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// @@ -1595,6 +1595,18 @@ static bool IsTailCallConvention(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC); } +bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!CI->isTailCall()) + return false; + + CallSite CS(CI); + CallingConv::ID CalleeCC = CS.getCallingConv(); + if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + return false; + + return true; +} + /// FuncIsMadeTailCallSafe - Return true if the function is being made into /// a tailcall target by changing its ABI. static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { @@ -1627,8 +1639,9 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { - int FI = MFI->CreateFixedObject(Flags.getByValSize(), - VA.getLocMemOffset(), isImmutable); + unsigned Bytes = Flags.getByValSize(); + if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. + int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); return DAG.getFrameIndex(FI, getPointerTy()); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, @@ -1765,8 +1778,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { - if (!IsWin64 && (Is64Bit || (CallConv != CallingConv::X86_FastCall && - CallConv != CallingConv::X86_ThisCall))) { + if (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall)) { FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); } if (Is64Bit) { @@ -1818,7 +1831,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + // Fixup to set vararg frame on shadow area (4 x i64). 
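// --- Sketch, not part of the patch: the rule getTypeForExtArgOrReturn
// --- implements above, in a standalone model (bit widths stand in for the
// --- LLVM EVTs). A zero-extended i1 return needs only an 8-bit register on
// --- x86-64; everything else is widened to at least i32.
#include <cassert>

static unsigned typeForExtReturn(unsigned Bits, bool Is64Bit, bool IsZExt) {
  unsigned MinBits = (Is64Bit && Bits == 1 && IsZExt) ? 8 : 32;
  return Bits < MinBits ? MinBits : Bits;   // VT.bitsLT(MinVT) ? MinVT : VT
}

int main() {
  assert(typeForExtReturn(1, true, true) == 8);     // i1 zext: AL suffices
  assert(typeForExtReturn(1, false, true) == 32);   // 32-bit: widen to i32
  assert(typeForExtReturn(16, true, false) == 32);  // sext i16 -> i32
}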
+ if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so they @@ -1937,7 +1952,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, return SDValue(OutRetAddr.getNode(), 1); } -/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call +/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, @@ -2028,7 +2043,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue RetAddrFrIdx; - // Load return adress for tail calls. + // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); @@ -2185,7 +2200,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); if (GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -2266,7 +2281,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -2285,7 +2301,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, getTargetMachine().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -3173,7 +3190,8 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { unsigned NumElems = N->getValueType(0).getVectorNumElements(); - if (NumElems != 2 && NumElems != 4) + if ((NumElems != 2 && NumElems != 4) + || N->getValueType(0).getSizeInBits() > 128) return false; for (unsigned i = 0; i < NumElems/2; ++i) @@ -3195,19 +3213,36 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) return false; - for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (!isUndefOrEqual(BitI1, NumElts)) - return false; - } else { - if (!isUndefOrEqual(BitI1, j + NumElts)) + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. 
+ unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElts / NumSections; + + unsigned Start = 0; + unsigned End = NumSectionElts; + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = Start, j = s * NumSectionElts; + i != End; + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) return false; + if (V2IsSplat) { + if (!isUndefOrEqual(BitI1, NumElts)) + return false; + } else { + if (!isUndefOrEqual(BitI1, j + NumElts)) + return false; + } } + // Process the next 128 bits. + Start += NumSectionElts; + End += NumSectionElts; } + return true; } @@ -3255,14 +3290,27 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) return false; - for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElems / NumSections; + + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = s * NumSectionElts, j = s * NumSectionElts; + i != NumSectionElts * (s + 1); + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } } + return true; } @@ -3846,8 +3894,8 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, /// getShuffleScalarElt - Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. -SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, - unsigned Depth) { +static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, + unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. @@ -3895,11 +3943,15 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLDQ: case X86ISD::PUNPCKLQDQ: - DecodePUNPCKLMask(NumElems, ShuffleMask); + DecodePUNPCKLMask(VT, ShuffleMask); break; case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: - DecodeUNPCKLPMask(NumElems, ShuffleMask); + case X86ISD::VUNPCKLPS: + case X86ISD::VUNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: + DecodeUNPCKLPMask(VT, ShuffleMask); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, ShuffleMask); @@ -3968,7 +4020,7 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, /// getNumOfConsecutiveZeros - Return the number of elements of a vector /// shuffle operation which come from a consecutively from a zero. The -/// search can start in two diferent directions, from left or right. +/// search can start in two different directions, from left or right. static unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, bool ZerosFromLeft, SelectionDAG &DAG) { @@ -5263,6 +5315,7 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // Break it into (shuffle shuffle_hi, shuffle_lo). 
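// --- Sketch, not part of the patch: the per-128-bit-"section" mask shape the
// --- loops above accept. With AVX, unpcklps interleaves independently within
// --- each 128-bit lane, so the v8f32 mask is {0,8,1,9, 4,12,5,13}, not
// --- {0,8,1,9,2,10,3,11}. Standalone generator for the expected mask:
#include <cassert>
#include <vector>

static std::vector<int> unpcklMask(unsigned NumElts, unsigned VecBits) {
  unsigned NumSections = VecBits / 128;
  if (NumSections == 0) NumSections = 1;          // MMX: one 64-bit section
  unsigned SecElts = NumElts / NumSections;
  std::vector<int> M;
  for (unsigned s = 0; s != NumSections; ++s)
    for (unsigned j = s * SecElts; j != s * SecElts + SecElts / 2; ++j) {
      M.push_back(int(j));                        // element from V1
      M.push_back(int(j + NumElts));              // interleaved from V2
    }
  return M;
}

int main() {
  const int Expect[8] = {0, 8, 1, 9, 4, 12, 5, 13};
  std::vector<int> M = unpcklMask(8, 256);        // v8f32 under AVX
  for (int i = 0; i != 8; ++i)
    assert(M[size_t(i)] == Expect[i]);
}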
Locs.clear(); + Locs.resize(4); SmallVector<int,8> LoMask(4U, -1); SmallVector<int,8> HiMask(4U, -1); @@ -5508,12 +5561,16 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { X86::getShuffleSHUFImmediate(SVOp), DAG); } -static inline unsigned getUNPCKLOpcode(EVT VT) { +static inline unsigned getUNPCKLOpcode(EVT VT, const X86Subtarget *Subtarget) { switch(VT.getSimpleVT().SimpleTy) { case MVT::v4i32: return X86ISD::PUNPCKLDQ; case MVT::v2i64: return X86ISD::PUNPCKLQDQ; - case MVT::v4f32: return X86ISD::UNPCKLPS; - case MVT::v2f64: return X86ISD::UNPCKLPD; + case MVT::v4f32: + return Subtarget->hasAVX() ? X86ISD::VUNPCKLPS : X86ISD::UNPCKLPS; + case MVT::v2f64: + return Subtarget->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; + case MVT::v8f32: return X86ISD::VUNPCKLPSY; + case MVT::v4f64: return X86ISD::VUNPCKLPDY; case MVT::v16i8: return X86ISD::PUNPCKLBW; case MVT::v8i16: return X86ISD::PUNPCKLWD; default: @@ -5641,7 +5698,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // unpckh_undef). Only use pshufd if speed is more important than size. if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), dl, VT, V1, V1, DAG); if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); @@ -5762,7 +5819,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } if (X86::isUNPCKLMask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), + dl, VT, V1, V2, DAG); if (X86::isUNPCKHMask(SVOp)) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); @@ -5789,7 +5847,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); if (X86::isUNPCKLMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), + dl, VT, V2, V1, DAG); if (X86::isUNPCKHMask(NewSVOp)) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); @@ -5812,8 +5871,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); + if (VT == MVT::v2f64) { + X86ISD::NodeType Opcode = + getSubtarget()->hasAVX() ? 
X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; + return getTargetShuffleNode(Opcode, dl, VT, V1, V1, DAG); + } if (VT == MVT::v2i64) return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); } @@ -5840,7 +5902,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (X86::isUNPCKL_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), + dl, VT, V1, V1, DAG); if (X86::isUNPCKH_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); @@ -7868,6 +7931,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && "This should be used only on Windows targets"); + assert(!Subtarget->isTargetEnvMacho()); DebugLoc dl = Op.getDebugLoc(); // Get the inputs. @@ -7878,8 +7942,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Flag; EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; + unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); - Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); + Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -8809,8 +8874,8 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) - if (C->getAPIntValue() == 1) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) + if (C->isOne()) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; @@ -8825,8 +8890,8 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) - if (C->getAPIntValue() == 1) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) + if (C->isOne()) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; @@ -10351,21 +10416,48 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); + assert(!Subtarget->isTargetEnvMacho()); + // The lowering is pretty easy: we're just emitting the call to _alloca. The // non-trivial part is impdef of ESP. - // FIXME: The code should be tweaked as soon as we'll try to do codegen for - // mingw-w64. - const char *StackProbeSymbol = + if (Subtarget->isTargetWin64()) { + if (Subtarget->isTargetCygMing()) { + // ___chkstk(Mingw64): + // Clobbers R10, R11, RAX and EFLAGS. + // Updates RSP. + BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) + .addExternalSymbol("___chkstk") + .addReg(X86::RAX, RegState::Implicit) + .addReg(X86::RSP, RegState::Implicit) + .addReg(X86::RAX, RegState::Define | RegState::Implicit) + .addReg(X86::RSP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + } else { + // __chkstk(MSVCRT): does not update stack pointer. + // Clobbers R10, R11 and EFLAGS. + // FIXME: RAX(allocated size) might be reused and not killed. 
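// --- Sketch, not part of the patch: as the comments above note, MinGW64's
// --- ___chkstk both probes and moves RSP, while MSVC's __chkstk only probes,
// --- so the lowering emits an explicit SUB64rr RSP, RAX afterwards. A tiny
// --- standalone model of the caller's obligation:
#include <cassert>
#include <cstdint>

struct Cpu { uint64_t rsp; uint64_t rax; };

static void chkstkMinGW(Cpu &C) { C.rsp -= C.rax; }  // probes and adjusts RSP
static void chkstkMSVC(Cpu &C)  { (void)C; }         // probes only

static void emitWinAlloca(Cpu &C, uint64_t Size, bool IsMinGW) {
  C.rax = Size;                       // allocation size is passed in RAX
  if (IsMinGW) {
    chkstkMinGW(C);
  } else {
    chkstkMSVC(C);
    C.rsp -= C.rax;                   // the extra SUB64rr RSP, RAX
  }
}

int main() {
  Cpu A{0x100000, 0}, B{0x100000, 0};
  emitWinAlloca(A, 0x2000, true);
  emitWinAlloca(B, 0x2000, false);
  assert(A.rsp == B.rsp && A.rsp == 0xFE000);   // same final RSP either way
}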
+ BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) + .addExternalSymbol("__chkstk") + .addReg(X86::RAX, RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + // RAX has the offset to subtracted from RSP. + BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } + } else { + const char *StackProbeSymbol = Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; - BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) - .addExternalSymbol(StackProbeSymbol) - .addReg(X86::EAX, RegState::Implicit) - .addReg(X86::ESP, RegState::Implicit) - .addReg(X86::EAX, RegState::Define | RegState::Implicit) - .addReg(X86::ESP, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol(StackProbeSymbol) + .addReg(X86::EAX, RegState::Implicit) + .addReg(X86::ESP, RegState::Implicit) + .addReg(X86::EAX, RegState::Define | RegState::Implicit) + .addReg(X86::ESP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + } MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; @@ -12126,7 +12218,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. - // FIXME: this should verify that we are targetting a 486 or better. If not, + // FIXME: this should verify that we are targeting a 486 or better. If not, // we will turn this bswap into something that will be lowered to logical ops // instead of emitting the bswap asm. For now, we don't support 486 or lower // so don't worry about this. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 6ec4a7d..6301057 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -677,9 +677,6 @@ namespace llvm { /// getFunctionAlignment - Return the Log2 alignment of this function. virtual unsigned getFunctionAlignment(const Function *F) const; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const; - /// getStackCookieLocation - Return true if the target stores stack /// protector cookies at a fixed offset in some non-standard address /// space, and populates the address space and offset as @@ -846,6 +843,12 @@ namespace llvm { virtual bool isUsedByReturnOnly(SDNode *N) const; + virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; + + virtual EVT + getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const; + virtual bool CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index 45d1c6b..dd4f6a5 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -12,66 +12,91 @@ // //===----------------------------------------------------------------------===// -// FIXME: We don't support any intrinsics for these instructions yet. 
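// --- Sketch, not part of the patch: the 3DNow! multiclasses below derive
// --- each intrinsic name from the mnemonic via
// --- !strconcat("int_x86_3dnow", Ver, "_", Mn). A trivial standalone model
// --- of that name construction:
#include <cassert>
#include <string>

static std::string intrinsicName(const std::string &Mn,
                                 const std::string &Ver) {
  return "int_x86_3dnow" + Ver + "_" + Mn;
}

int main() {
  assert(intrinsicName("pfadd", "")   == "int_x86_3dnow_pfadd");
  assert(intrinsicName("pswapd", "a") == "int_x86_3dnowa_pswapd");  // 3DNowA
}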
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat> + : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> { +} -class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, Requires<[Has3DNow]> { +class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. + let isAsmParserOnly = 1; + let Constraints = "$src1 = $dst"; } -class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic> - : I<o, F, (outs VR64:$dst), ins, - !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), []>, - TB, Requires<[Has3DNow]>, Has3DNow0F0FOpcode { +class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, + Has3DNow0F0FOpcode { // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. let isAsmParserOnly = 1; } +multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>; +} + +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; +} + +multiclass I3DNow_conv_rm<bits<8> opc, string Mn> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>; +} -let Constraints = "$src1 = $dst" in { - // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. - // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
- multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { - def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn>; - def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn>; - } +multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) + (bitconvert (load_mmx addr:$src))))]>; } -defm PAVGUSB : I3DNow_binop_rm<0xBF, "pavgusb">; -defm PF2ID : I3DNow_binop_rm<0x1D, "pf2id">; -defm PFACC : I3DNow_binop_rm<0xAE, "pfacc">; -defm PFADD : I3DNow_binop_rm<0x9E, "pfadd">; -defm PFCMPEQ : I3DNow_binop_rm<0xB0, "pfcmpeq">; -defm PFCMPGE : I3DNow_binop_rm<0x90, "pfcmpge">; -defm PFCMPGT : I3DNow_binop_rm<0xA0, "pfcmpgt">; -defm PFMAX : I3DNow_binop_rm<0xA4, "pfmax">; -defm PFMIN : I3DNow_binop_rm<0x94, "pfmin">; -defm PFMUL : I3DNow_binop_rm<0xB4, "pfmul">; -defm PFRCP : I3DNow_binop_rm<0x96, "pfrcp">; -defm PFRCPIT1 : I3DNow_binop_rm<0xA6, "pfrcpit1">; -defm PFRCPIT2 : I3DNow_binop_rm<0xB6, "pfrcpit2">; -defm PFRSQIT1 : I3DNow_binop_rm<0xA7, "pfrsqit1">; -defm PFRSQRT : I3DNow_binop_rm<0x97, "pfrsqrt">; -defm PFSUB : I3DNow_binop_rm<0x9A, "pfsub">; -defm PFSUBR : I3DNow_binop_rm<0xAA, "pfsubr">; -defm PI2FD : I3DNow_binop_rm<0x0D, "pi2fd">; -defm PMULHRW : I3DNow_binop_rm<0xB7, "pmulhrw">; +defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">; +defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">; +defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">; +defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">; +defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">; +defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">; +defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">; +defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">; +defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">; +defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">; +defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">; +defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">; +defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">; +defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">; +defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">; +defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">; +defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">; +defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">; +defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">; def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>; def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr), "prefetch $addr", []>; - + // FIXME: Diassembler gets a bogus decode conflict. 
-let isAsmParserOnly = 1 in { +let isAsmParserOnly = 1 in def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr), "prefetchw $addr", []>; -} // "3DNowA" instructions -defm PF2IW : I3DNow_binop_rm<0x1C, "pf2iw">; -defm PI2FW : I3DNow_binop_rm<0x0C, "pi2fw">; -defm PFNACC : I3DNow_binop_rm<0x8A, "pfnacc">; -defm PFPNACC : I3DNow_binop_rm<0x8E, "pfpnacc">; -defm PSWAPD : I3DNow_binop_rm<0xBB, "pswapd">; +defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; +defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">; +defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f0ea068..9f7a4b0 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -163,7 +163,7 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), } // Defs = [EFLAGS] -// Suprisingly enough, these are not two address instructions! +// Surprisingly enough, these are not two address instructions! let Defs = [EFLAGS] in { // Register-Integer Signed Integer Multiply def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index 77f4725..c228a0ae 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -263,6 +263,16 @@ let isCall = 1, isCodeGenOnly = 1 in Requires<[IsWin64]>; } +let isCall = 1, isCodeGenOnly = 1 in + // __chkstk(MSVC): clobber R10, R11 and EFLAGS. + // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP. + let Defs = [RAX, R10, R11, RSP, EFLAGS], + Uses = [RSP] in { + def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + "call{q}\t$dst", []>, + Requires<[IsWin64]>; + } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1 in diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 0660072..7daa264 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -91,21 +91,23 @@ class REX_W { bit hasREX_WPrefix = 1; } class LOCK { bit hasLockPrefix = 1; } class SegFS { bits<2> SegOvrBits = 1; } class SegGS { bits<2> SegOvrBits = 2; } -class TB { bits<4> Prefix = 1; } -class REP { bits<4> Prefix = 2; } -class D8 { bits<4> Prefix = 3; } -class D9 { bits<4> Prefix = 4; } -class DA { bits<4> Prefix = 5; } -class DB { bits<4> Prefix = 6; } -class DC { bits<4> Prefix = 7; } -class DD { bits<4> Prefix = 8; } -class DE { bits<4> Prefix = 9; } -class DF { bits<4> Prefix = 10; } -class XD { bits<4> Prefix = 11; } -class XS { bits<4> Prefix = 12; } -class T8 { bits<4> Prefix = 13; } -class TA { bits<4> Prefix = 14; } -class TF { bits<4> Prefix = 15; } +class TB { bits<5> Prefix = 1; } +class REP { bits<5> Prefix = 2; } +class D8 { bits<5> Prefix = 3; } +class D9 { bits<5> Prefix = 4; } +class DA { bits<5> Prefix = 5; } +class DB { bits<5> Prefix = 6; } +class DC { bits<5> Prefix = 7; } +class DD { bits<5> Prefix = 8; } +class DE { bits<5> Prefix = 9; } +class DF { bits<5> Prefix = 10; } +class XD { bits<5> Prefix = 11; } +class XS { bits<5> Prefix = 12; } +class T8 { bits<5> Prefix = 13; } +class TA { bits<5> Prefix = 14; } +class A6 { bits<5> Prefix = 15; } +class A7 { bits<5> Prefix = 16; } +class TF { bits<5> Prefix = 17; } class VEX { bit hasVEXPrefix = 1; } class VEX_W { bit hasVEX_WPrefix = 1; } class VEX_4V : VEX { bit 
hasVEX_4VPrefix = 1; } @@ -136,7 +138,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix? bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix? - bits<4> Prefix = 0; // Which prefix byte does this inst have? + bits<5> Prefix = 0; // Which prefix byte does this inst have? bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? @@ -154,20 +156,20 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{5-0} = FormBits; let TSFlags{6} = hasOpSizePrefix; let TSFlags{7} = hasAdSizePrefix; - let TSFlags{11-8} = Prefix; - let TSFlags{12} = hasREX_WPrefix; - let TSFlags{15-13} = ImmT.Value; - let TSFlags{18-16} = FPForm.Value; - let TSFlags{19} = hasLockPrefix; - let TSFlags{21-20} = SegOvrBits; - let TSFlags{23-22} = ExeDomain.Value; - let TSFlags{31-24} = Opcode; - let TSFlags{32} = hasVEXPrefix; - let TSFlags{33} = hasVEX_WPrefix; - let TSFlags{34} = hasVEX_4VPrefix; - let TSFlags{35} = hasVEX_i8ImmReg; - let TSFlags{36} = hasVEX_L; - let TSFlags{37} = has3DNow0F0FOpcode; + let TSFlags{12-8} = Prefix; + let TSFlags{13} = hasREX_WPrefix; + let TSFlags{16-14} = ImmT.Value; + let TSFlags{19-17} = FPForm.Value; + let TSFlags{20} = hasLockPrefix; + let TSFlags{22-21} = SegOvrBits; + let TSFlags{24-23} = ExeDomain.Value; + let TSFlags{32-25} = Opcode; + let TSFlags{33} = hasVEXPrefix; + let TSFlags{34} = hasVEX_WPrefix; + let TSFlags{35} = hasVEX_4VPrefix; + let TSFlags{36} = hasVEX_i8ImmReg; + let TSFlags{37} = hasVEX_L; + let TSFlags{38} = has3DNow0F0FOpcode; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -319,7 +321,7 @@ class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasAVX]>; class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, TB, Requires<[HasAVX]>; // SSE2 Instruction Templates: @@ -353,7 +355,7 @@ class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasAVX]>; class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, TB, OpSize, Requires<[HasAVX]>; // SSE3 Instruction Templates: diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 5016c0f..3cbfac1 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -132,6 +132,8 @@ def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; +def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>; +def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>; def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 76a9b12..83f0260 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -232,7 +232,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable2Addr.count(RegOp) && "Duplicated entries?"); 
RegOp2MemOpTable2Addr[RegOp] = std::make_pair(MemOp, 0U); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl2Addr[i][1] & TB_NOT_REVERSABLE) continue; @@ -335,7 +335,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable0.count(RegOp) && "Duplicated entries?"); RegOp2MemOpTable0[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl0[i][1] & TB_NOT_REVERSABLE) continue; @@ -460,7 +460,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable1.count(RegOp) && "Duplicate entries"); RegOp2MemOpTable1[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl1[i][1] & TB_NOT_REVERSABLE) continue; @@ -682,7 +682,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable2.count(RegOp) && "Duplicate entry!"); RegOp2MemOpTable2[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl2[i][1] & TB_NOT_REVERSABLE) continue; @@ -916,7 +916,6 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::MOVSDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MMX_MOVD64rm: @@ -2241,6 +2240,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, bool isTwoAddr = NumOps > 1 && MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1; + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOpcode() == X86::ADD32ri && + MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return NULL; + MachineInstr *NewMI = NULL; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires @@ -2535,6 +2540,12 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, case X86::TEST32rr: case X86::TEST64rr: return true; + case X86::ADD32ri: + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. 
+ if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return false; + break; } } @@ -2845,11 +2856,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } switch (Opc2) { @@ -2869,11 +2878,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } @@ -3085,12 +3092,8 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } -bool X86InstrInfo:: -hasHighOperandLatency(const InstrItineraryData *ItinData, - const MachineRegisterInfo *MRI, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - switch (DefMI->getOpcode()) { +bool X86InstrInfo::isHighLatencyDef(int opc) const { + switch (opc) { default: return false; case X86::DIVSDrm: case X86::DIVSDrm_Int: @@ -3120,6 +3123,14 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, } } +bool X86InstrInfo:: +hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + return isHighLatencyDef(DefMI->getOpcode()); +} + namespace { /// CGBR - Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index fcb5a25..8da68b5 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -33,15 +33,15 @@ namespace X86 { AddrScaleAmt = 1, AddrIndexReg = 2, AddrDisp = 3, - + /// AddrSegmentReg - The operand # of the segment in the memory operand. AddrSegmentReg = 4, /// AddrNumOperands - Total number of operands in a memory reference. AddrNumOperands = 5 }; - - + + // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. enum CondCode { @@ -72,16 +72,16 @@ namespace X86 { COND_INVALID }; - + // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); - + /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. CondCode GetOppositeBranchCondition(X86::CondCode CC); } - + /// X86II - This namespace holds all of the target specific flags that /// instruction info tracks. /// @@ -90,14 +90,14 @@ namespace X86II { enum TOF { //===------------------------------------------------------------------===// // X86 Specific MachineOperand flags. - + MO_NO_FLAG, - + /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a /// relocation of: /// SYMBOL_LABEL + [. - PICBASELABEL] MO_GOT_ABSOLUTE_ADDRESS, - + /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the /// immediate should get the value of the symbol minus the PIC base label: /// SYMBOL_LABEL - PICBASELABEL @@ -106,77 +106,77 @@ namespace X86II { /// MO_GOT - On a symbol operand this indicates that the immediate is the /// offset to the GOT entry for the symbol name from the base of the GOT. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. 
/// SYMBOL_LABEL @GOT MO_GOT, - + /// MO_GOTOFF - On a symbol operand this indicates that the immediate is - /// the offset to the location of the symbol name from the base of the GOT. + /// the offset to the location of the symbol name from the base of the GOT. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. /// SYMBOL_LABEL @GOTOFF MO_GOTOFF, - + /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is /// offset to the GOT entry for the symbol name from the current code - /// location. + /// location. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. /// SYMBOL_LABEL @GOTPCREL MO_GOTPCREL, - + /// MO_PLT - On a symbol operand this indicates that the immediate is - /// offset to the PLT entry of symbol name from the current code location. + /// offset to the PLT entry of symbol name from the current code location. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. /// SYMBOL_LABEL @PLT MO_PLT, - + /// MO_TLSGD - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TLSGD MO_TLSGD, - + /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @GOTTPOFF MO_GOTTPOFF, - + /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @INDNTPOFF MO_INDNTPOFF, - + /// MO_TPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TPOFF MO_TPOFF, - + /// MO_NTPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @NTPOFF MO_NTPOFF, - + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the /// reference is actually to the "__imp_FOO" symbol. This is used for /// dllimport linkage on windows. MO_DLLIMPORT, - + /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the /// reference is actually to the "FOO$stub" symbol. This is used for calls /// and jumps to external functions on Tiger and earlier. MO_DARWIN_STUB, - + /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub. @@ -186,19 +186,19 @@ namespace X86II { /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub. 
MO_DARWIN_NONLAZY_PIC_BASE, - + /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE", /// which is a PIC-base-relative reference to a hidden dyld lazy pointer /// stub. MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, - + /// MO_TLVP - On a symbol operand this indicates that the immediate is /// some TLS offset. /// /// This is the TLS offset for the Darwin TLS mechanism. MO_TLVP, - + /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate /// is some TLS offset from the picbase. /// @@ -239,7 +239,7 @@ inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) { return false; } } - + /// X86II - This namespace holds all of the target specific flags that /// instruction info tracks. /// @@ -299,7 +299,7 @@ namespace X86II { // MRMInitReg - This form is used for instructions whose source and // destinations are the same register. MRMInitReg = 32, - + //// MRM_C1 - A mod/rm byte of exactly 0xC1. MRM_C1 = 33, MRM_C2 = 34, @@ -318,7 +318,7 @@ namespace X86II { /// immediates, the first of which is a 16-bit immediate (specified by /// the imm encoding) and the second is a 8-bit fixed value. RawFrmImm8 = 43, - + /// RawFrmImm16 - This is used for CALL FAR instructions, which have two /// immediates, the first of which is a 16 or 32-bit immediate (specified by /// the imm encoding) and the second is a 16-bit fixed value. In the AMD @@ -347,7 +347,7 @@ namespace X86II { // set, there is no prefix byte for obtaining a multibyte opcode. // Op0Shift = 8, - Op0Mask = 0xF << Op0Shift, + Op0Mask = 0x1F << Op0Shift, // TB - TwoByte - Set if this instruction has a two byte opcode, which // starts with a 0x0F byte before the real opcode. @@ -368,11 +368,12 @@ namespace X86II { // floating point operations performed in the SSE registers. XD = 11 << Op0Shift, XS = 12 << Op0Shift, - // T8, TA - Prefix after the 0x0F prefix. + // T8, TA, A6, A7 - Prefix after the 0x0F prefix. T8 = 13 << Op0Shift, TA = 14 << Op0Shift, - + A6 = 15 << Op0Shift, A7 = 16 << Op0Shift, + // TF - Prefix before and after 0x0F - TF = 15 << Op0Shift, + TF = 17 << Op0Shift, //===------------------------------------------------------------------===// // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. @@ -380,13 +381,13 @@ namespace X86II { // etc. We only care about REX.W and REX.R bits and only the former is // statically determined. // - REXShift = 12, + REXShift = Op0Shift + 5, REX_W = 1 << REXShift, //===------------------------------------------------------------------===// // This three-bit field describes the size of an immediate operand. Zero is // unused so that we can tell if we forgot to set a value. - ImmShift = 13, + ImmShift = REXShift + 1, ImmMask = 7 << ImmShift, Imm8 = 1 << ImmShift, Imm8PCRel = 2 << ImmShift, @@ -400,7 +401,7 @@ namespace X86II { // FP Instruction Classification... Zero is non-fp instruction. // FPTypeMask - Mask for all of the FP types... - FPTypeShift = 16, + FPTypeShift = ImmShift + 3, FPTypeMask = 7 << FPTypeShift, // NotFP - The default, set for instructions that do not use FP registers. @@ -433,25 +434,26 @@ namespace X86II { SpecialFP = 7 << FPTypeShift, // Lock prefix - LOCKShift = 19, + LOCKShift = FPTypeShift + 3, LOCK = 1 << LOCKShift, // Segment override prefixes. Currently we just need the ability to address // stuff in gs and fs segments.
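An aside on the relayout above: with the prefix field widened to five bits, every later field is defined by chaining off the previous shift, which puts the 8-bit opcode at bit 25. A minimal C++ sketch, assuming illustrative offsets that mirror that layout (names here are hypothetical, not LLVM's API), of how such a packed TSFlags word is decoded and why the opcode mask must become a 64-bit constant; the remaining X86II fields continue in the diff below:

#include <cassert>
#include <cstdint>

// Illustrative offsets mirroring the chained-shift layout: Prefix is 5 bits
// at bit 8, and the 8-bit opcode lands at bit 25.
enum : unsigned { DemoOp0Shift = 8, DemoOpcodeShift = 25 };

// Decoding a field is a shift down plus a mask of the field's width.
static unsigned demoGetPrefix(uint64_t TSFlags) {
  return (TSFlags >> DemoOp0Shift) & 0x1F;          // 5 bits wide now, not 4
}
static unsigned char demoGetOpcode(uint64_t TSFlags) {
  return (unsigned char)(TSFlags >> DemoOpcodeShift);
}

int main() {
  // 0xFF << 25 reaches bit 32 and overflows a 32-bit int, which is why the
  // mask in the diff is spelled 0xFFULL << OpcodeShift.
  const uint64_t DemoOpcodeMask = 0xFFULL << DemoOpcodeShift;
  uint64_t Flags = (0x0FULL << DemoOpcodeShift) | (11ULL << DemoOp0Shift);
  assert(demoGetOpcode(Flags) == 0x0F);             // opcode round-trips
  assert(demoGetPrefix(Flags) == 11);               // prefix field (XD above)
  assert((Flags & DemoOpcodeMask) == (0x0FULL << DemoOpcodeShift));
  return 0;
}

The chaining also keeps later fields consistent automatically: VEXShift is simply OpcodeShift + 8, so growing any earlier field moves everything after it without hand-renumbering.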
- SegOvrShift = 20, + SegOvrShift = LOCKShift + 1, SegOvrMask = 3 << SegOvrShift, FS = 1 << SegOvrShift, GS = 2 << SegOvrShift, - // Execution domain for SSE instructions in bits 22, 23. - // 0 in bits 22-23 means normal, non-SSE instruction. - SSEDomainShift = 22, + // Execution domain for SSE instructions in bits 23, 24. + // 0 in bits 23-24 means normal, non-SSE instruction. + SSEDomainShift = SegOvrShift + 2, - OpcodeShift = 24, - OpcodeMask = 0xFF << OpcodeShift, + OpcodeShift = SSEDomainShift + 2, + OpcodeMask = 0xFFULL << OpcodeShift, //===------------------------------------------------------------------===// /// VEX - The opcode prefix used by AVX instructions + VEXShift = OpcodeShift + 8, VEX = 1U << 0, /// VEX_W - Has an opcode specific functionality, but is used in the same @@ -473,7 +475,7 @@ namespace X86II { /// if a VR256 register is used, but some AVX instructions also have this /// field marked when using an f256 memory reference. VEX_L = 1U << 4, - + /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction @@ -482,18 +484,18 @@ namespace X86II { /// this flag to indicate that the encoder should do the wacky 3DNow! thing. Has3DNow0F0FOpcode = 1U << 5 }; - + // getBaseOpcodeFor - This function returns the "base" X86 opcode for the // specified machine instruction. // static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { return TSFlags >> X86II::OpcodeShift; } - + static inline bool hasImm(uint64_t TSFlags) { return (TSFlags & X86II::ImmMask) != 0; } - + /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field /// of the specified instruction. static inline unsigned getSizeOfImm(uint64_t TSFlags) { @@ -508,7 +510,7 @@ namespace X86II { case X86II::Imm64: return 8; } } - + /// isImmPCRel - Return true if the immediate of the specified instruction's /// TSFlags indicates that it is pc relative. static inline unsigned isImmPCRel(uint64_t TSFlags) { @@ -525,7 +527,7 @@ namespace X86II { return false; } } - + /// getMemoryOperandNo - The function returns the MCInst operand # for the /// first field of the memory operand. If the instruction doesn't have a /// memory operand, this returns -1. @@ -549,11 +551,11 @@ namespace X86II { case X86II::MRMDestMem: return 0; case X86II::MRMSrcMem: { - bool HasVEX_4V = (TSFlags >> 32) & X86II::VEX_4V; + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). - + // FIXME: Maybe lea should have its own form? This is a horrible hack. //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || // Opcode == X86::LEA16r || Opcode == X86::LEA32r) @@ -613,7 +615,7 @@ inline static bool isMem(const MachineInstr *MI, unsigned Op) { class X86InstrInfo : public TargetInstrInfoImpl { X86TargetMachine &TM; const X86RegisterInfo RI; - + /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, /// RegOp2MemOpTable2 - Load / store folding opcode maps. /// @@ -621,7 +623,7 @@ class X86InstrInfo : public TargetInstrInfoImpl { DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable0; DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable1; DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable2; - + /// MemOp2RegOpTable - Load / store unfolding opcode map.
/// DenseMap<unsigned, std::pair<unsigned, unsigned> > MemOp2RegOpTable; @@ -795,7 +797,7 @@ public: virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex = 0) const; - + /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler /// to determine if two loads are loading from the same base address. It /// should only return true if the base pointers are the same and the /// @@ -805,7 +807,7 @@ public: int64_t &Offset1, int64_t &Offset2) const; /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to - /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled together. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. This function takes two integers that represent the load offsets @@ -829,7 +831,7 @@ public: return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); } - + static bool isX86_64ExtendedReg(const MachineOperand &MO) { if (!MO.isReg()) return false; return isX86_64ExtendedReg(MO.getReg()); @@ -858,11 +860,13 @@ public: const SmallVectorImpl<MachineOperand> &MOs, unsigned Size, unsigned Alignment) const; + bool isHighLatencyDef(int opc) const; + bool hasHighOperandLatency(const InstrItineraryData *ItinData, const MachineRegisterInfo *MRI, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const; - + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index f832a7c..03a0b0c 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -459,7 +459,7 @@ def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; include "X86InstrFormats.td" //===----------------------------------------------------------------------===// -// Pattern fragments... +// Pattern fragments. // // X86 specific condition code. These correspond to CondCode in @@ -481,21 +481,21 @@ def X86_COND_O : PatLeaf<(i8 13)>; def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE def X86_COND_S : PatLeaf<(i8 15)>; -def immSext8 : PatLeaf<(imm), [{ return immSext8(N); }]>; +let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs. + def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; + def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; + def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; +} -def i16immSExt8 : PatLeaf<(i16 immSext8)>; -def i32immSExt8 : PatLeaf<(i32 immSext8)>; -def i64immSExt8 : PatLeaf<(i64 immSext8)>; -def i64immSExt32 : PatLeaf<(i64 imm), [{ return i64immSExt32(N); }]>; -def i64immZExt32 : PatLeaf<(i64 imm), [{ - // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit - // unsignedsign extended field. - return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue(); -}]>; +def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>; + + +// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit // unsigned field.
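A note on the PatLeaf-to-ImmLeaf conversion above: ImmLeaf hands the predicate a plain integer rather than an SDNode, so each body reduces to a round-trip cast test. A minimal C++ sketch of those fit checks, with hypothetical helper names mirroring the predicate bodies (the i64immZExt32 def itself continues just below):

#include <cassert>
#include <cstdint>

// A value "fits" a narrower type exactly when casting to that type and back
// preserves it; these mirror the ImmLeaf bodies above.
static bool fitsSExt8(int64_t Imm)  { return Imm == (int8_t)Imm; }
static bool fitsSExt32(int64_t Imm) { return Imm == (int32_t)Imm; }
static bool fitsZExt32(int64_t Imm) { return (uint64_t)Imm == (uint32_t)Imm; }

int main() {
  assert(fitsSExt8(-128) && !fitsSExt8(128)); // int8_t range is [-128, 127]
  assert(fitsZExt32(0xFFFFFFFFLL));           // fits a 32-bit unsigned field
  assert(!fitsSExt32(0xFFFFFFFFLL));          // but sign-extending would
                                              // read it back as -1
  return 0;
}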
+def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>; -def i64immZExt32SExt8 : PatLeaf<(i64 imm), [{ - uint64_t v = N->getZExtValue(); - return v == (uint32_t)v && (int32_t)v == (int8_t)v; +def i64immZExt32SExt8 : ImmLeaf<i64, [{ + return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm; }]>; // Helper fragments for loads. @@ -1437,7 +1437,7 @@ def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on ST1. // For example, "fxch" -> "fxch %st(1)" -def : InstAlias<"faddp", (ADD_FPrST0 ST1)>; +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>; def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>; def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; @@ -1455,13 +1455,15 @@ def : InstAlias<"fucompi", (UCOM_FIPr ST1)>; // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with // gas. -multiclass FpUnaryAlias<string Mnemonic, Instruction Inst> { - def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), (Inst RST:$op)>; - def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), (Inst ST0)>; +multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), + (Inst ST0), EmitAlias>; } defm : FpUnaryAlias<"fadd", ADD_FST0r>; -defm : FpUnaryAlias<"faddp", ADD_FPrST0>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; defm : FpUnaryAlias<"fsub", SUB_FST0r>; defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>; defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; @@ -1472,8 +1474,8 @@ defm : FpUnaryAlias<"fdiv", DIV_FST0r>; defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>; defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>; -defm : FpUnaryAlias<"fcomi", COM_FIr>; -defm : FpUnaryAlias<"fucomi", UCOM_FIr>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; defm : FpUnaryAlias<"fcompi", COM_FIPr>; defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; @@ -1481,7 +1483,7 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they // commute. We also allow fdiv[r]p/fsubrp even though they don't commute, // solely because gas supports it. -def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op)>; +def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; @@ -1534,29 +1536,31 @@ def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>; def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>; // Match 'movq GR64, MMX' as an alias for movd. -def : InstAlias<"movq $src, $dst", (MMX_MOVD64to64rr VR64:$dst, GR64:$src)>; -def : InstAlias<"movq $src, $dst", (MMX_MOVD64from64rr GR64:$dst, VR64:$src)>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsd with no operands (as opposed to the SSE scalar move of a double) is an // alias for movsl.
(as in rep; movsd) def : InstAlias<"movsd", (MOVSD)>; // movsx aliases -def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src)>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx $src, $dst", (MOVZX16rr8W GR16:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX16rm8W GR16:$dst, i8mem:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src)>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rr8W GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rm8W GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. // outb %dx -> outb %al, %dx diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index b912949..cde3f6b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -135,18 +135,16 @@ class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, // is used instead. Register-to-register movss/movsd is not modeled as an // INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable // in terms of a copy, and just mentioned, we don't use movss/movsd for copies. 
-let isAsmParserOnly = 1 in { - def VMOVSSrr : sse12_move_rr<FR32, v4f32, - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V; - def VMOVSDrr : sse12_move_rr<FR64, v2f64, - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V; +def VMOVSSrr : sse12_move_rr<FR32, v4f32, + "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V; +def VMOVSDrr : sse12_move_rr<FR64, v2f64, + "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V; - let canFoldAsLoad = 1, isReMaterializable = 1 in { - def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX; +let canFoldAsLoad = 1, isReMaterializable = 1 in { + def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX; - let AddedComplexity = 20 in - def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX; - } + let AddedComplexity = 20 in + def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX; } let Constraints = "$src1 = $dst" in { @@ -218,14 +216,12 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>; -let isAsmParserOnly = 1 in { def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", [(store FR32:$src, addr:$dst)]>, XS, VEX; def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>, XD, VEX; -} // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), @@ -251,7 +247,6 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in [(set RC:$dst, (ld_frag addr:$src))], d>; } -let isAsmParserOnly = 1 in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle>, VEX; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, @@ -269,7 +264,6 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle>, VEX; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, 0>, OpSize, VEX; -} defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle>, TB; defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, @@ -279,7 +273,6 @@ defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, 0>, TB, OpSize; -let isAsmParserOnly = 1 in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX; @@ -304,7 +297,6 @@ def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [(store (v4f64 VR256:$src), addr:$dst)]>, VEX; -} def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>; def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), @@ -328,32 +320,14 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), [(store (v2f64 VR128:$src), addr:$dst)]>; // Intrinsic forms of MOVUPS/D load and store -let isAsmParserOnly = 1 in { - let canFoldAsLoad = 1, isReMaterializable = 1 in - def VMOVUPSrm_Int : VPSI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>, VEX; - def 
VMOVUPDrm_Int : VPDI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>, VEX; - def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movups\t{$src, $dst|$dst, $src}", - [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; - def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; -} -let canFoldAsLoad = 1, isReMaterializable = 1 in -def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>; -def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>; +def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; +def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", @@ -382,7 +356,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC, SSEPackedDouble>, TB, OpSize; } -let isAsmParserOnly = 1, AddedComplexity = 20 in { +let AddedComplexity = 20 in { defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", @@ -395,7 +369,6 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { "\t{$src2, $dst|$dst, $src2}">; } -let isAsmParserOnly = 1 in { def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), @@ -404,7 +377,6 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), @@ -416,7 +388,6 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. 
-let isAsmParserOnly = 1 in { def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract @@ -429,7 +400,6 @@ def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (v2f64 (unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract @@ -441,7 +411,7 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (v2f64 (unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst)]>; -let isAsmParserOnly = 1, AddedComplexity = 20 in { +let AddedComplexity = 20 in { def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -516,7 +486,6 @@ multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; } -let isAsmParserOnly = 1 in { defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, @@ -542,7 +511,6 @@ defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD, VEX_4V; defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD, VEX_4V, VEX_W; -} defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}">, XS; @@ -591,27 +559,25 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>; } -let isAsmParserOnly = 1 in { - defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, - f32mem, load, "cvtss2si">, XS, VEX; - defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">, - XS, VEX, VEX_W; - defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, - f128mem, load, "cvtsd2si">, XD, VEX; - defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, - XD, VEX, VEX_W; - - // FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ - // Get rid of this hack or rename the intrinsics, there are several - // intructions that only match with the intrinsic form, why create duplicates - // to let them be recognized by the assembler? 
- defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; - defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; -} +defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + f32mem, load, "cvtss2si">, XS, VEX; +defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">, + XS, VEX, VEX_W; +defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + f128mem, load, "cvtsd2si">, XD, VEX; +defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, + XD, VEX, VEX_W; + +// FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ +// Get rid of this hack or rename the intrinsics, there are several +// instructions that only match with the intrinsic form, why create duplicates +// to let them be recognized by the assembler? +defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; +defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, f32mem, load, "cvtss2si">, XS; defm Int_CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, @@ -622,18 +588,16 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si{q}">, XD, REX_W; -let isAsmParserOnly = 1 in { - defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; - defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, - VEX_W; - defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; - defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, - VEX_4V, VEX_W; -} +defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; +defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, + VEX_W; +defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; +defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, + VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in { defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, @@ -653,7 +617,6 @@ let Constraints = "$src1 = $dst" in { /// SSE 1 Only // Aliases for intrinsics -let isAsmParserOnly = 1 in { defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, f32mem, load, "cvttss2si">, XS, VEX; defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, @@ -664,7 +627,6 @@ defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, "cvttsd2si">, XD, VEX, VEX_W; -} defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, f32mem, load, "cvttss2si">, XS; defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, @@ -676,7 +638,7 @@ defm Int_CVTTSD2SI64 :
sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, "cvttsd2si{q}">, XD, REX_W; -let isAsmParserOnly = 1, Pattern = []<dag> in { +let Pattern = []<dag> in { defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX; defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load, @@ -702,7 +664,6 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/, /// SSE 2 Only // Convert scalar double to scalar single -let isAsmParserOnly = 1 in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -711,7 +672,6 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR64:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V; -} def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, Requires<[HasAVX]>; @@ -723,7 +683,6 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD, Requires<[HasSSE2, OptForSize]>; -let isAsmParserOnly = 1 in defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>, XS, VEX_4V; @@ -732,7 +691,7 @@ defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS; // Convert scalar single to scalar double -let isAsmParserOnly = 1 in { // SSE2 instructions with XS prefix +// SSE2 instructions with XS prefix def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -741,7 +700,6 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR32:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>; -} def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; @@ -754,7 +712,6 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, Requires<[HasSSE2, OptForSize]>; -let isAsmParserOnly = 1 in { def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -767,7 +724,6 @@ def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, (load addr:$src2)))]>, XS, VEX_4V, Requires<[HasAVX]>; -} let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -788,7 +744,7 @@ def : Pat<(extloadf32 addr:$src), Requires<[HasSSE2, OptForSpeed]>; // Convert doubleword to packed single/double fp -let isAsmParserOnly = 1 in { // SSE2 instructions without OpSize prefix +// SSE2 instructions without OpSize prefix def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, @@ -798,7 +754,6 @@ def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2ps (bitconvert (memopv2i64 addr:$src))))]>, TB, VEX, Requires<[HasAVX]>; -} def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps 
VR128:$src))]>, @@ -810,7 +765,7 @@ def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), TB, Requires<[HasSSE2]>; // FIXME: why the non-intrinsic version is described as SSE3? -let isAsmParserOnly = 1 in { // SSE2 instructions with XS prefix +// SSE2 instructions with XS prefix def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, @@ -820,7 +775,6 @@ def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2pd (bitconvert (memopv2i64 addr:$src))))]>, XS, VEX, Requires<[HasAVX]>; -} def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, @@ -833,7 +787,6 @@ def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), // Convert packed single/double fp to doubleword -let isAsmParserOnly = 1 in { def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -842,13 +795,11 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -} def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>; def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>; -let isAsmParserOnly = 1 in { def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>, @@ -858,7 +809,6 @@ def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memop addr:$src)))]>, VEX; -} def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>; @@ -867,7 +817,7 @@ def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memop addr:$src)))]>; -let isAsmParserOnly = 1 in { // SSE2 packed instructions with XD prefix +// SSE2 packed instructions with XD prefix def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, @@ -877,7 +827,6 @@ def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtpd2dq (memop addr:$src)))]>, XD, VEX, Requires<[HasAVX]>; -} def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, @@ -890,7 +839,7 @@ def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // Convert with truncation packed single/double fp to doubleword -let isAsmParserOnly = 1 in { // SSE2 packed instructions with XS prefix +// SSE2 packed instructions with XS prefix def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; 
def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -899,7 +848,6 @@ def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -} def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -910,7 +858,6 @@ def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), (int_x86_sse2_cvttps2dq (memop addr:$src)))]>; -let isAsmParserOnly = 1 in { def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -921,9 +868,7 @@ def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvttps2dq (memop addr:$src)))]>, XS, VEX, Requires<[HasAVX]>; -} -let isAsmParserOnly = 1 in { def Int_VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", @@ -934,7 +879,6 @@ def Int_VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memop addr:$src)))]>, VEX; -} def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>; @@ -943,7 +887,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memop addr:$src)))]>; -let isAsmParserOnly = 1 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
@@ -963,10 +906,9 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} // Convert packed single to packed double -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // SSE2 instructions without OpSize prefix def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; @@ -982,7 +924,6 @@ def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; -let isAsmParserOnly = 1 in { def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, @@ -992,7 +933,6 @@ def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtps2pd (load addr:$src)))]>, VEX, Requires<[HasAVX]>; -} def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, @@ -1004,7 +944,6 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), TB, Requires<[HasSSE2]>; // Convert packed double to packed single -let isAsmParserOnly = 1 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. @@ -1024,14 +963,12 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; -let isAsmParserOnly = 1 in { def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; @@ -1040,7 +977,6 @@ def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps (memop addr:$src)))]>; -} def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; @@ -1089,26 +1025,27 @@ def : Pat<(int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)), // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, string asm, string asm_alt> { - def rr : SIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), - asm, []>; - let mayLoad = 1 in - def rm : SIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src, SSECC:$cc), - asm, []>; - // Accept explicit immediate argument form instead of comparison code. 
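For context on the two assembler forms in these compare multiclasses: the "cmp${cc}ss" mnemonics bake the predicate into the instruction name, while the "_alt" variants accept the raw imm8 that the ${cc} name abbreviates. A small C++ table of the standard SSE compare-predicate encodings (general ISA background, not taken from this patch; the multiclass continues below):

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

// The eight classic SSE/SSE2 compare predicates and their imm8 encodings;
// "cmpeqss %xmm1, %xmm0" is shorthand for "cmpss $0, %xmm1, %xmm0", which is
// why each multiclass carries both an SSECC form and an i8imm "_alt" form.
static const std::map<std::string, uint8_t> SSECmpPredicate = {
  {"eq", 0},  {"lt", 1},  {"le", 2},  {"unord", 3},
  {"neq", 4}, {"nlt", 5}, {"nle", 6}, {"ord", 7},
};

int main() {
  assert(SSECmpPredicate.at("eq") == 0);
  assert(SSECmpPredicate.at("ord") == 7);
  return 0;
}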
let isAsmParserOnly = 1 in { - def rr_alt : SIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2), - asm_alt, []>; + def rr : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), + asm, []>; let mayLoad = 1 in - def rm_alt : SIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src, i8imm:$src2), - asm_alt, []>; + def rm : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src, SSECC:$cc), + asm, []>; } + + // Accept explicit immediate argument form instead of comparison code. + def rr_alt : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2), + asm_alt, []>; + let mayLoad = 1 in + def rm_alt : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src, i8imm:$src2), + asm_alt, []>; } -let neverHasSideEffects = 1, isAsmParserOnly = 1 in { +let neverHasSideEffects = 1 in { defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", "cmpss\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">, @@ -1141,14 +1078,12 @@ multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop, } // Aliases to match intrinsics which expect XMM operand(s). -let isAsmParserOnly = 1 in { - defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, - XS, VEX_4V; - defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, - XD, VEX_4V; -} +defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, + XS, VEX_4V; +defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, + XD, VEX_4V; let Constraints = "$src1 = $dst" in { defm Int_CMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS; @@ -1171,28 +1106,26 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, } let Defs = [EFLAGS] in { - let isAsmParserOnly = 1 in { - defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, VEX; - defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, OpSize, VEX; - let Pattern = []<dag> in { - defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, VEX; - defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, OpSize, VEX; - } - - defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss", SSEPackedSingle>, VEX; - defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd", SSEPackedDouble>, OpSize, VEX; - - defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, - load, "comiss", SSEPackedSingle>, VEX; - defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, - load, "comisd", SSEPackedDouble>, OpSize, VEX; + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, VEX; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, OpSize, VEX; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, VEX; + defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", 
SSEPackedDouble>, OpSize, VEX; } + + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd", SSEPackedDouble>, OpSize, VEX; + + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss", SSEPackedSingle>, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, "comisd", SSEPackedDouble>, OpSize, VEX; defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, "ucomiss", SSEPackedSingle>, TB; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, @@ -1220,41 +1153,40 @@ let Defs = [EFLAGS] in { multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, string asm, string asm_alt, Domain d> { - def rri : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, RC:$src, imm:$cc))], d>; - def rmi : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, f128mem:$src, SSECC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, (memop addr:$src), imm:$cc))], d>; - // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1 in { - def rri_alt : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2), - asm_alt, [], d>; - def rmi_alt : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, f128mem:$src, i8imm:$src2), - asm_alt, [], d>; + def rri : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, RC:$src, imm:$cc))], d>; + def rmi : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f128mem:$src, SSECC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, (memop addr:$src), imm:$cc))], d>; } -} -let isAsmParserOnly = 1 in { - defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, - "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedSingle>, VEX_4V; - defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, - "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, - "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedSingle>, VEX_4V; - defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, - "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; -} + // Accept explicit immediate argument form instead of comparison code. 
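// e.g. "cmpunordps %xmm1, %xmm0" and "cmpps $3, %xmm1, %xmm0" are the same
// instruction; the rri_alt/rmi_alt defs below supply the latter spelling.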
+ def rri_alt : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2), + asm_alt, [], d>; + def rmi_alt : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f128mem:$src, i8imm:$src2), + asm_alt, [], d>; +} + +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src, $dst|$dst, $src}", @@ -1294,20 +1226,18 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>; } -let isAsmParserOnly = 1 in { - defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, - "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv4f32, SSEPackedSingle>, VEX_4V; - defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, - "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv8f32, SSEPackedSingle>, VEX_4V; - defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv2f64, SSEPackedDouble>, OpSize, VEX_4V; - defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv4f64, SSEPackedDouble>, OpSize, VEX_4V; -} +defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv4f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv8f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -1340,33 +1270,31 @@ multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt, } let AddedComplexity = 10 in { - let isAsmParserOnly = 1 in { - defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, - VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, - VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, - VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, 
VEX_4V; - defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, - VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - - defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, - VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, - VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, - VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, - VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - } + defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + + defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, @@ -1404,30 +1332,28 @@ defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", SSEPackedDouble>, TB, OpSize; -let isAsmParserOnly = 1 in { - defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, - "movmskps", SSEPackedSingle>, VEX; - defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, - "movmskpd", SSEPackedDouble>, OpSize, - VEX; - defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, - "movmskps", SSEPackedSingle>, VEX; - defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, - "movmskpd", SSEPackedDouble>, OpSize, - VEX; +defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, + "movmskps", SSEPackedSingle>, VEX; +defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, + "movmskpd", SSEPackedDouble>, OpSize, + VEX; +defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, + "movmskps", SSEPackedSingle>, VEX; +defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, 
int_x86_avx_movmsk_pd_256, + "movmskpd", SSEPackedDouble>, OpSize, + VEX; - // Assembler Only - def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; - def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, - VEX; - def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; - def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, - VEX; -} +// Assembler Only +def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; +def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; +def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; +def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Misc aliasing of packed SSE 1 & 2 instructions @@ -1482,13 +1408,11 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), /// multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, SDNode OpNode> { - let isAsmParserOnly = 1 in { - defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V; + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V; - defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V; - } + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, @@ -1514,7 +1438,7 @@ let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, SDNode OpNode, int HasPat = 0, list<list<dag>> Pattern = []> { - let isAsmParserOnly = 1, Pattern = []<dag> in { + let Pattern = []<dag> in { defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, !if(HasPat, Pattern[0], // rr @@ -1561,7 +1485,6 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, /// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms /// -let isAsmParserOnly = 1 in { multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> { defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, [], [], 0>, VEX_4V; @@ -1569,7 +1492,6 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> { defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, [], [], 0>, OpSize, VEX_4V; } -} // AVX 256-bit packed logical ops forms defm VAND : sse12_fp_packed_logical_y<0x54, "and">; @@ -1667,38 +1589,36 @@ multiclass 
basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> { } // Binary Arithmetic instructions -let isAsmParserOnly = 1 in { - defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_s_int<0x58, "add", 0>, - basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; - defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_s_int<0x59, "mul", 0>, - basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; +defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_s_int<0x58, "add", 0>, + basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; +defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_s_int<0x59, "mul", 0>, + basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; - let isCommutable = 0 in { - defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, - basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; - defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_s_int<0x5E, "div", 0>, - basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; - defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_s_int<0x5F, "max", 0>, - basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_p_int<0x5F, "max", 0>, - basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, - basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; - defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_s_int<0x5D, "min", 0>, - basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_p_int<0x5D, "min", 0>, - basic_sse12_fp_binop_p_y_int<0x5D, "min">, - basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; - } +let isCommutable = 0 in { + defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; + defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_s_int<0x5E, "div", 0>, + basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; + defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_s_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_p_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; + defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_s_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p_y_int<0x5D, "min">, + basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; } let Constraints = "$src1 = $dst" in { @@ -1899,7 +1819,7 @@ multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // Square root. 
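// Note that the AVX scalar forms are non-destructive three-operand
// instructions: roughly, "vsqrtss %xmm1, %xmm2, %xmm0" writes the square
// root of the low element of %xmm1 into %xmm0 and copies the upper elements
// from %xmm2, whereas the SSE "sqrtss %xmm1, %xmm0" merges into its
// destination. That is why the separate *_avx multiclasses are used below
// rather than the plain SSE ones.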
defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ss>, sse2_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_sd>, @@ -1955,67 +1875,65 @@ defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, // SSE 1 & 2 - Non-temporal stores //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in { - def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX; - def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX; +def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX; +def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs), +let ExeDomain = SSEPackedInt in + def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX; + +let AddedComplexity = 400 in { // Prefer non-temporal versions + def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX; - - let AddedComplexity = 400 in { // Prefer non-temporal versions - def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), - addr:$dst)]>, VEX; - def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; - def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), + [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>, VEX; - - def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f32 VR256:$src), - addr:$dst)]>, VEX; - def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; - def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQYmr : 
VPDI<0xE7, MRMDestMem, (outs), + let ExeDomain = SSEPackedInt in + def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + + def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f32 VR256:$src), + [(alignednontemporalstore (v4f64 VR256:$src), addr:$dst)]>, VEX; - } + let ExeDomain = SSEPackedInt in + def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; } def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src), @@ -2138,12 +2056,10 @@ def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), // SSE 1 & 2 - Load/Store XCSR register //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in { - def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; - def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; -} +def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; +def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>; @@ -2156,45 +2072,43 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), let ExeDomain = SSEPackedInt in { // SSE integer instructions -let isAsmParserOnly = 1 in { - let neverHasSideEffects = 1 in { - def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - } - def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - - let canFoldAsLoad = 1, mayLoad = 1 in { - def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - let Predicates = [HasAVX] in { - def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - } - } +let neverHasSideEffects = 1 in { +def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 
+ "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +} +def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; +def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - let mayStore = 1 in { - def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i256mem:$dst, VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - let Predicates = [HasAVX] in { - def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), +let canFoldAsLoad = 1, mayLoad = 1 in { +def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - } - } +} +} + +let mayStore = 1 in { +def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { +def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +} } let neverHasSideEffects = 1 in @@ -2226,23 +2140,11 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), } // Intrinsic forms of MOVDQU load and store -let isAsmParserOnly = 1 in { -let canFoldAsLoad = 1 in -def VMOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, VEX, Requires<[HasAVX]>; def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "vmovdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, XS, VEX, Requires<[HasAVX]>; -} -let canFoldAsLoad = 1 in -def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, Requires<[HasSSE2]>; def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, @@ -2347,7 +2249,7 @@ multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, // 128-bit Integer Arithmetic -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V; defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V; defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V; @@ -2437,7 +2339,7 @@ defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; // SSE2 - Packed Integer Logical Instructions 
//===---------------------------------------------------------------------===// -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw", int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>, VEX_4V; @@ -2584,7 +2486,7 @@ let Predicates = [HasSSE2] in { // SSE2 - Packed Integer Comparison Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPEQB : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1, 0>, VEX_4V; defm VPCMPEQW : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1, @@ -2638,7 +2540,7 @@ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), // SSE2 - Packed Integer Pack Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128, 0, 0>, VEX_4V; defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128, @@ -2676,7 +2578,7 @@ def mi : Ii8<0x70, MRMSrcMem, } } // ExeDomain = SSEPackedInt -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let AddedComplexity = 5 in defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize, VEX; @@ -2724,7 +2626,7 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, addr:$src2))))]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, unpckl, bc_v16i8, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, unpckl, bc_v8i16, @@ -2834,7 +2736,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { } // Extract -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -2847,7 +2749,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, imm:$src2))]>; // Insert -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V; def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), @@ -2866,13 +2768,11 @@ let Constraints = "$src1 = $dst" in let ExeDomain = SSEPackedInt in { -let isAsmParserOnly = 1 in { def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX; def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX; -} def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; @@ -2885,7 +2785,6 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), let ExeDomain = SSEPackedInt in { -let isAsmParserOnly = 1 in { let Uses = [EDI] in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), @@ -2896,7 +2795,6 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX; -} let Uses = [EDI] in def MASKMOVDQU : 
PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), @@ -2914,7 +2812,6 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), //===---------------------------------------------------------------------===// // Move Int Doubleword to Packed Double Int -let isAsmParserOnly = 1 in { def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2924,7 +2821,6 @@ def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), [(set VR128:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, VEX; -} def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2943,7 +2839,6 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), // Move Int Doubleword to Single Scalar -let isAsmParserOnly = 1 in { def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX; @@ -2952,7 +2847,6 @@ def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, VEX; -} def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>; @@ -2962,7 +2856,6 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>; // Move Packed Doubleword Int to Packed Double Int -let isAsmParserOnly = 1 in { def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -2972,7 +2865,6 @@ def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -2998,14 +2890,12 @@ def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; // Move Scalar Single to Double Int -let isAsmParserOnly = 1 in { def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX; def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX; -} def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>; @@ -3014,7 +2904,7 @@ def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>; // movd / movq to XMM register zero-extends -let AddedComplexity = 15, isAsmParserOnly = 1 in { +let AddedComplexity = 15 in { def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl @@ -3038,7 +2928,6 @@ def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), } let AddedComplexity = 20 in { -let isAsmParserOnly = 1 in def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, 
$src}", [(set VR128:$dst, @@ -3064,7 +2953,6 @@ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), //===---------------------------------------------------------------------===// // Move Quadword Int to Packed Quadword Int -let isAsmParserOnly = 1 in def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3077,7 +2965,6 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix // Move Packed Quadword Int to Quadword Int -let isAsmParserOnly = 1 in def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), @@ -3091,7 +2978,6 @@ def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; // Store / copy lower 64-bits of a XMM register. -let isAsmParserOnly = 1 in def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX; @@ -3099,7 +2985,7 @@ def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; -let AddedComplexity = 20, isAsmParserOnly = 1 in +let AddedComplexity = 20 in def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3124,7 +3010,7 @@ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in // IA32 document. movq xmm1, xmm2 does clear the high bits. 
-let isAsmParserOnly = 1, AddedComplexity = 15 in +let AddedComplexity = 15 in def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, @@ -3135,7 +3021,7 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, XS, Requires<[HasSSE2]>; -let AddedComplexity = 20, isAsmParserOnly = 1 in +let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl @@ -3153,7 +3039,6 @@ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))), } // Instructions to match in the assembler -let isAsmParserOnly = 1 in { def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), @@ -3161,13 +3046,12 @@ def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), // Recognize "movd" with GR64 destination, but encode as a "movq" def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; -} // Instructions for the disassembler // xr = XMM register // xm = mem64 -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -3209,7 +3093,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, //===---------------------------------------------------------------------===// // Convert Packed Double FP to Packed DW Integers -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
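// For example, "vcvtpd2dq (%rax), %xmm0" is ambiguous in assembly: the
// memory source could be 128 or 256 bits wide. The explicitly suffixed
// "x"/"y" mnemonics (as with cvtpd2psy and cvttpd2dqy earlier in this file)
// pin down the operand size for the memory forms.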
@@ -3237,7 +3121,7 @@ def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; // Convert Packed DW Integers to Packed Double FP -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -3288,7 +3172,7 @@ def rm : S3SI<op, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // FIXME: Merge above classes when we have patterns for the ymm version defm VMOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX; @@ -3319,7 +3203,7 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), []>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // FIXME: Merge above classes when we have patterns for the ymm version defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX; @@ -3327,7 +3211,7 @@ let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm MOVDDUP : sse3_replicate_dfp<"movddup">; // Move Unaligned Integer -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vlddqu\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; @@ -3391,21 +3275,21 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX], +let Predicates = [HasAVX], ExeDomain = SSEPackedDouble in { defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, - f128mem, 0>, XD, VEX_4V; + f128mem, 0>, TB, XD, VEX_4V; defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, - f128mem, 0>, OpSize, VEX_4V; + f128mem, 0>, TB, OpSize, VEX_4V; defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, 0>, XD, VEX_4V; + f256mem, 0>, TB, XD, VEX_4V; defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, 0>, OpSize, VEX_4V; + f256mem, 0>, TB, OpSize, VEX_4V; } let Constraints = "$src1 = $dst", Predicates = [HasSSE3], ExeDomain = SSEPackedDouble in { defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, - f128mem>, XD; + f128mem>, TB, XD; defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, f128mem>, TB, OpSize; } @@ -3444,7 +3328,7 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, int_x86_sse3_hadd_ps, 0>, VEX_4V; defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, @@ -3496,7 +3380,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (mem_frag128 addr:$src))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8, int_x86_ssse3_pabs_b_128>, VEX; defm VPABSW : SS3I_unop_rm_int<0x1D, 
"vpabsw", memopv8i16, @@ -3538,7 +3422,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16, int_x86_ssse3_phadd_w_128, 0>, VEX_4V; @@ -3630,7 +3514,7 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> { []>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PALIGN : ssse3_palign<"palignr">; @@ -3985,7 +3869,7 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, VEX; defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>, @@ -4051,7 +3935,7 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>, VEX; defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>, @@ -4092,7 +3976,7 @@ multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, VEX; defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, @@ -4134,7 +4018,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), @@ -4156,7 +4040,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { // (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -4178,7 +4062,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; defm PEXTRD : SS41I_extract32<0x16, "pextrd">; @@ -4199,7 +4083,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize, REX_W; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; @@ -4222,7 +4106,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), @@ -4262,7 +4146,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, 
OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -4288,7 +4172,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -4314,7 +4198,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -4347,7 +4231,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { let Constraints = "$src1 = $dst" in defm INSERTPS : SS41I_insertf32<0x21, "insertps">; -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), @@ -4517,7 +4401,7 @@ multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd, } // FP round - roundss, roundps, roundsd, roundpd -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // Intrinsic form defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, memopv4f32, memopv2f64, @@ -4552,7 +4436,7 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", // ptest instruction we'll lower to this in X86ISelLowering primarily from // the intel intrinsic that corresponds to this. 
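// For reference: ptest ANDs its two sources and sets ZF if the result is
// all zeroes (CF is set analogously from the AND-NOT), so an intrinsic such
// as _mm_testz_si128 becomes a ptest followed by a read of ZF.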
-let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Defs = [EFLAGS], Predicates = [HasAVX] in { def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, @@ -4595,7 +4479,7 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, OpSize, VEX; } -let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Defs = [EFLAGS], Predicates = [HasAVX] in { defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>; defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; @@ -4644,7 +4528,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, (bitconvert (memopv8i16 addr:$src))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", int_x86_sse41_phminposuw>, VEX; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", @@ -4670,7 +4554,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, 0>, VEX_4V; @@ -4737,7 +4621,7 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>; @@ -4769,7 +4653,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, VR128, memopv16i8, i128mem, 0>, VEX_4V; @@ -4810,7 +4694,7 @@ let Constraints = "$src1 = $dst" in { } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, Intrinsic IntId> { @@ -4870,7 +4754,7 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; def : Pat<(X86pblendv VR128:$src1, VR128:$src2, XMM0), (PBLENDVBrr0 VR128:$src1, VR128:$src2)>; -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, @@ -4904,7 +4788,7 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq, 0>, VEX_4V; let Constraints = "$src1 = $dst" in @@ -4936,8 +4820,7 @@ let Defs = [EFLAGS], usesCustomInserter = 1 in { defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; } -let Defs = [XMM0, EFLAGS], isAsmParserOnly = 1, - Predicates = [HasAVX] in { +let Defs = [XMM0, EFLAGS], 
Predicates = [HasAVX] in { def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; @@ -4972,7 +4855,7 @@ let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; } -let isAsmParserOnly = 1, Predicates = [HasAVX], +let Predicates = [HasAVX], Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, i8imm:$src5), @@ -5007,7 +4890,7 @@ let Defs = [ECX, EFLAGS] in { } } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128, "vpcmpistri">, VEX; defm VPCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128, "vpcmpistri">, @@ -5046,7 +4929,7 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in { } } -let isAsmParserOnly = 1, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128, "vpcmpestri">, VEX; defm VPCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128, "vpcmpestri">, @@ -5165,7 +5048,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, } // Perform One Round of an AES Encryption/Decryption Flow -let isAsmParserOnly = 1, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", int_x86_aesni_aesenc, 0>, VEX_4V; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", @@ -5205,7 +5088,7 @@ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), (AESDECLASTrm VR128:$src1, addr:$src2)>; // Perform the AES InvMixColumn Transformation -let isAsmParserOnly = 1, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", @@ -5233,7 +5116,7 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), OpSize; // AES Round Key Generation Assist -let isAsmParserOnly = 1, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -5269,7 +5152,6 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), // Only the AVX version of CLMUL instructions are described here. 
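// (CLMUL = carry-less multiplication: pclmulqdq multiplies two 64-bit
// operands as polynomials over GF(2), i.e. with XOR in place of addition,
// into a 128-bit product; it is used by GHASH in AES-GCM and by fast CRC
// computations. The imm8 selects the high or low quadword of each source,
// and the pclmul*qdq mnemonics defined below are fixed-immediate spellings
// of that choice.)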
// Carry-less Multiplication instructions -let isAsmParserOnly = 1 in { def VPCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -5295,13 +5177,10 @@ defm VPCLMULHQLQDQ : avx_vpclmul<"vpclmulhqlqdq">; defm VPCLMULLQHQDQ : avx_vpclmul<"vpclmullqhqdq">; defm VPCLMULLQLQDQ : avx_vpclmul<"vpclmullqlqdq">; -} // isAsmParserOnly - //===----------------------------------------------------------------------===// // AVX Instructions //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in { // Load from memory and broadcast to all elements of the destination operand class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, @@ -5435,8 +5314,6 @@ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", [(int_x86_avx_vzeroupper)]>, VEX, Requires<[HasAVX]>; -} // isAsmParserOnly - def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3), (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), @@ -5622,11 +5499,15 @@ def : Pat<(X86Movddup (bc_v2f64 // Shuffle with UNPCKLPS def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), (UNPCKLPSrm VR128:$src1, addr:$src2)>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), (VUNPCKLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), (UNPCKLPSrr VR128:$src1, VR128:$src2)>; @@ -5644,11 +5525,15 @@ def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), // Shuffle with UNPCKLPD def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), (VUNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), (UNPCKLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), (VUNPCKLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), (UNPCKLPDrr VR128:$src1, VR128:$src2)>; diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 6a24d14..f73cff3 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -34,9 +34,16 @@ let Uses = [EFLAGS] in def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>; + +// The long form of "int $3" turns into int3 as a size optimization. +// FIXME: This doesn't work because InstAlias can't match immediate constants. 
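// ("int $3" encodes as CD 03, two bytes, while int3 is the single byte CC,
// which is also the breakpoint byte debuggers patch in; the disabled alias
// below would let the assembler perform that shrink automatically.)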
+//def : InstAlias<"int\t$3", (INT3)>; + + def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", [(int_x86_int imm:$trap)]>; + def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; def SYSRETL : I<0x07, RawFrm, (outs), (ins), "sysretl", []>, TB; def SYSRETQ :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB, @@ -207,10 +214,15 @@ def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; -def STRr : I<0x00, MRM1r, (outs GR16:$dst), (ins), - "str{w}\t{$dst}", []>, TB; -def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), - "str{w}\t{$dst}", []>, TB; +def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), + "str{w}\t{$dst}", []>, TB, OpSize; +def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), + "str{l}\t{$dst}", []>, TB; +def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), + "str{q}\t{$dst}", []>, TB; +def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), + "str{w}\t{$dst}", []>, TB; + def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t{$src}", []>, TB; def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), @@ -393,3 +405,23 @@ let Defs = [RDX, RAX], Uses = [RCX] in let Uses = [RDX, RAX, RCX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; + +//===----------------------------------------------------------------------===// +// VIA PadLock crypto instructions +let Defs = [RAX, RDI], Uses = [RDX, RDI] in + def XSTORE : I<0xc0, RawFrm, (outs), (ins), "xstore", []>, A7; + +let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in { + def XCRYPTECB : I<0xc8, RawFrm, (outs), (ins), "xcryptecb", []>, A7; + def XCRYPTCBC : I<0xd0, RawFrm, (outs), (ins), "xcryptcbc", []>, A7; + def XCRYPTCTR : I<0xd8, RawFrm, (outs), (ins), "xcryptctr", []>, A7; + def XCRYPTCFB : I<0xe0, RawFrm, (outs), (ins), "xcryptcfb", []>, A7; + def XCRYPTOFB : I<0xe8, RawFrm, (outs), (ins), "xcryptofb", []>, A7; +} + +let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { + def XSHA1 : I<0xc8, RawFrm, (outs), (ins), "xsha1", []>, A6; + def XSHA256 : I<0xd0, RawFrm, (outs), (ins), "xsha256", []>, A6; +} +let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in + def MONTMUL : I<0xc0, RawFrm, (outs), (ins), "montmul", []>, A6; diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp index 6686214..83bba52 100644 --- a/lib/Target/X86/X86MCAsmInfo.cpp +++ b/lib/Target/X86/X86MCAsmInfo.cpp @@ -15,7 +15,9 @@ #include "X86TargetMachine.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ELF.h" using namespace llvm; @@ -69,7 +71,22 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) { DwarfUsesInlineInfoSection = true; // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +const MCExpr * +X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context); + const MCExpr *Four = MCConstantExpr::Create(4, Context); + return MCBinaryExpr::CreateAdd(Res, Four, Context); +} + +X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) + : X86MCAsmInfoDarwin(Triple) { } X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { @@ -89,7 +106,9 @@ 
X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
   SupportsDebugInformation = true;

   // Exceptions handling
-  ExceptionsType = ExceptionHandling::DwarfTable;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  DwarfRequiresFrameSection = false;

   // OpenBSD has buggy support for .quad in 32-bit mode, just split into two
   // .words.
diff --git a/lib/Target/X86/X86MCAsmInfo.h b/lib/Target/X86/X86MCAsmInfo.h
index 5815225..2cd4c8e 100644
--- a/lib/Target/X86/X86MCAsmInfo.h
+++ b/lib/Target/X86/X86MCAsmInfo.h
@@ -25,6 +25,14 @@ namespace llvm {
     explicit X86MCAsmInfoDarwin(const Triple &Triple);
   };

+  struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+    explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+    virtual const MCExpr *
+    getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                unsigned Encoding,
+                                MCStreamer &Streamer) const;
+  };
+
   struct X86ELFMCAsmInfo : public MCAsmInfo {
     explicit X86ELFMCAsmInfo(const Triple &Triple);
     virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp
index 0e3b571..f195a67 100644
--- a/lib/Target/X86/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/X86MCCodeEmitter.cpp
@@ -382,7 +382,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
                                            const TargetInstrDesc &Desc,
                                            raw_ostream &OS) const {
   bool HasVEX_4V = false;
-  if ((TSFlags >> 32) & X86II::VEX_4V)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V)
     HasVEX_4V = true;

   // VEX_R: opcode extension equivalent to REX.R in
@@ -446,10 +446,10 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   if (TSFlags & X86II::OpSize)
     VEX_PP = 0x01;

-  if ((TSFlags >> 32) & X86II::VEX_W)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
     VEX_W = 1;

-  if ((TSFlags >> 32) & X86II::VEX_L)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
     VEX_L = 1;

   switch (TSFlags & X86II::Op0Mask) {
@@ -470,6 +470,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::XD:  // F2 0F
     VEX_PP = 0x3;
     break;
+  case X86II::A6:  // Bypass: Not used by VEX
+  case X86II::A7:  // Bypass: Not used by VEX
   case X86II::TB:  // Bypass: Not used by VEX
   case 0:
     break;  // No prefix!
@@ -512,13 +514,13 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   }

   // To only check operands before the memory address ones, start
-  // the search from the begining
+  // the search from the beginning
   if (IsDestMem)
     CurOp = 0;

   // If the last register should be encoded in the immediate field,
   // do not use any bit from the VEX prefix for this register; ignore it
-  if ((TSFlags >> 32) & X86II::VEX_I8IMM)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM)
     NumOps--;

   for (; CurOp != NumOps; ++CurOp) {
@@ -742,6 +744,8 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::TB:  // Two-byte opcode prefix
   case X86II::T8:  // 0F 38
   case X86II::TA:  // 0F 3A
+  case X86II::A6:  // 0F A6
+  case X86II::A7:  // 0F A7
     Need0FPrefix = true;
     break;
   case X86II::TF: // F2 0F 38
@@ -786,6 +790,12 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::TA:    // 0F 3A
     EmitByte(0x3A, CurByte, OS);
     break;
+  case X86II::A6:    // 0F A6
+    EmitByte(0xA6, CurByte, OS);
+    break;
+  case X86II::A7:    // 0F A7
+    EmitByte(0xA7, CurByte, OS);
+    break;
   }
 }

@@ -819,9 +829,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   // It uses the VEX.VVVV field?
bool HasVEX_4V = false;
-  if ((TSFlags >> 32) & X86II::VEX)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX)
     HasVEXPrefix = true;
-  if ((TSFlags >> 32) & X86II::VEX_4V)
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V)
     HasVEX_4V = true;
@@ -837,7 +847,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,

   unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);

-  if ((TSFlags >> 32) & X86II::Has3DNow0F0FOpcode)
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
     BaseOpcode = 0x0F; // Weird 3DNow! encoding.

   unsigned SrcRegNum = 0;
@@ -994,7 +1004,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   if (CurOp != NumOps) {
     // The last source register of a 4-operand instruction in AVX is encoded
     // in bits[7:4] of an immediate byte, and bits[3:0] are ignored.
-    if ((TSFlags >> 32) & X86II::VEX_I8IMM) {
+    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
       const MCOperand &MO = MI.getOperand(CurOp++);
       bool IsExtReg = X86InstrInfo::isX86_64ExtendedReg(MO.getReg());
@@ -1017,7 +1027,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     }
   }

-  if ((TSFlags >> 32) & X86II::Has3DNow0F0FOpcode)
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
     EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);

diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 2f6bd88..37fb0fe 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -308,6 +308,33 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
   return 0;
 }

+const TargetRegisterClass*
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->superclasses_begin();
+  do {
+    switch (Super->getID()) {
+    case X86::GR8RegClassID:
+    case X86::GR16RegClassID:
+    case X86::GR32RegClassID:
+    case X86::GR64RegClassID:
+    case X86::FR32RegClassID:
+    case X86::FR64RegClassID:
+    case X86::RFP32RegClassID:
+    case X86::RFP64RegClassID:
+    case X86::RFP80RegClassID:
+    case X86::VR128RegClassID:
+    case X86::VR256RegClassID:
+      // Don't return a super-class that would shrink the spill size.
+      // That can happen with the vector and float classes.
+      if (Super->getSize() == RC->getSize())
+        return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
+
 const TargetRegisterClass *
 X86RegisterInfo::getPointerRegClass(unsigned Kind) const {
   switch (Kind) {
@@ -337,7 +364,27 @@ X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
     else
       return &X86::GR32RegClass;
   }
-  return NULL;
+  return RC;
+}
+
+unsigned
+X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                     MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case X86::GR32RegClassID:
+    return 4 - FPDiff;
+  case X86::GR64RegClassID:
+    return 12 - FPDiff;
+  case X86::VR128RegClassID:
+    return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4;
+  case X86::VR64RegClassID:
+    return 4;
+  }
 }

 const unsigned *
@@ -450,7 +497,7 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {

   // FIXME: It's more complicated than this...
if (0 && requiresRealignment && MFI->hasVarSizedObjects()) report_fatal_error( - "Stack realignment in presense of dynamic allocas is not supported"); + "Stack realignment in presence of dynamic allocas is not supported"); // If we've requested that we force align the stack do so now. if (ForceStackAlign) diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 064be64..9970c52 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -91,6 +91,9 @@ public: getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx) const; + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; @@ -101,6 +104,9 @@ public: const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + /// getCalleeSavedRegs - Return a null-terminated list of all of the /// callee-save registers on this target. const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 612fac2..fd7a247 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -46,7 +46,8 @@ let Namespace = "X86" in { def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>; def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>; - // X86-64 only + // X86-64 only, requires REX. + let CostPerUse = 1 in { def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>; def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>; def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>; @@ -59,6 +60,7 @@ let Namespace = "X86" in { def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>; def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>; def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>; + } // High registers. On x86-64, these cannot be used in any instruction // with a REX prefix. @@ -82,8 +84,8 @@ let Namespace = "X86" in { } def IP : Register<"ip">, DwarfRegNum<[16]>; - // X86-64 only - let SubRegIndices = [sub_8bit] in { + // X86-64 only, requires REX. 
+ let SubRegIndices = [sub_8bit], CostPerUse = 1 in { def R8W : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>; def R9W : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>; def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>; @@ -105,7 +107,8 @@ let Namespace = "X86" in { def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>; def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>; - // X86-64 only + // X86-64 only, requires REX + let CostPerUse = 1 in { def R8D : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>; def R9D : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>; def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>; @@ -114,7 +117,7 @@ let Namespace = "X86" in { def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>; def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>; def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>; - } + }} // 64-bit registers, X86-64 only let SubRegIndices = [sub_32bit] in { @@ -127,6 +130,8 @@ let Namespace = "X86" in { def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>; def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>; + // These also require REX. + let CostPerUse = 1 in { def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>; def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>; def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>; @@ -136,7 +141,7 @@ let Namespace = "X86" in { def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>; def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>; def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>; - } + }} // MMX Registers. These are actually aliased to ST0 .. ST7 def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>; @@ -170,6 +175,7 @@ let Namespace = "X86" in { def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>; // X86-64 only + let CostPerUse = 1 in { def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>; def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>; def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>; @@ -178,7 +184,7 @@ let Namespace = "X86" in { def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>; def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>; def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>; - } + }} // YMM Registers, used by AVX instructions let SubRegIndices = [sub_xmm] in { diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 42e8193..02754f9 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -178,7 +178,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - // This requires the copy size to be a constant, preferrably + // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); if (!ConstantSize) diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 1ee7312..ba5864e 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -144,7 +144,8 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { /// passed as the second argument. 
Otherwise it returns null. const char *X86Subtarget::getBZeroEntry() const { // Darwin 10 has a __bzero entry point for this purpose. - if (getDarwinVers() >= 10) + if (getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 6)) return "__bzero"; return 0; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 0a62a02..286a798 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -165,9 +165,15 @@ public: bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } - bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } - bool isTargetFreeBSD() const { return TargetTriple.getOS() == Triple::FreeBSD; } - bool isTargetSolaris() const { return TargetTriple.getOS() == Triple::Solaris; } + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetFreeBSD() const { + return TargetTriple.getOS() == Triple::FreeBSD; + } + bool isTargetSolaris() const { + return TargetTriple.getOS() == Triple::Solaris; + } // ELF is a reasonably sane default and the only other X86 targets we // support are Darwin and Windows. Just use "not those". @@ -215,13 +221,6 @@ public: return PICStyle == PICStyles::StubDynamicNoPIC || PICStyle == PICStyles::StubPIC; } - /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard, - /// 10 = Snow Leopard, etc. - unsigned getDarwinVers() const { - if (isTargetDarwin()) return TargetTriple.getDarwinMajorNumber(); - return 0; - } - /// ClassifyGlobalReference - Classify a global variable reference for the /// current subtarget according to how we should reference it in a non-pcrel /// context. 
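With getDarwinVers() removed, Darwin version checks are phrased directly against the triple; the old scheme numbered the Darwin kernel (Darwin 10 == Mac OS X 10.6), which is why the check above reads isMacOSXVersionLT(10, 6). A minimal standalone sketch of the new idiom (the free function is hypothetical; only llvm::Triple from this tree is assumed):

    #include "llvm/ADT/Triple.h"

    // Hypothetical mirror of X86Subtarget::getBZeroEntry(): __bzero is
    // available on Mac OS X 10.6 (Darwin 10) and later.
    static const char *getBZeroEntryFor(const llvm::Triple &TT) {
      if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6))
        return "__bzero";
      return 0; // no dedicated entry point; callers fall back to memset
    }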
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 889c824..7483329 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -26,19 +26,18 @@ using namespace llvm; static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: - return new X86MCAsmInfoDarwin(TheTriple); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (TheTriple.getEnvironment() == Triple::MachO) - return new X86MCAsmInfoDarwin(TheTriple); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) { + if (TheTriple.getArch() == Triple::x86_64) + return new X86_64MCAsmInfoDarwin(TheTriple); else - return new X86MCAsmInfoCOFF(TheTriple); - default: - return new X86ELFMCAsmInfo(TheTriple); + return new X86MCAsmInfoDarwin(TheTriple); } + + if (TheTriple.isOSWindows()) + return new X86MCAsmInfoCOFF(TheTriple); + + return new X86ELFMCAsmInfo(TheTriple); } static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, @@ -48,19 +47,14 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, bool RelaxAll, bool NoExecStack) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (TheTriple.getEnvironment() == Triple::MachO) - return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); - else - return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); - default: - return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); - } + + if (TheTriple.isOSWindows()) + return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); + + return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); } extern "C" void LLVMInitializeX86Target() { @@ -96,11 +90,11 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT, const std::string &FS) : X86TargetMachine(T, TT, FS, false), DataLayout(getSubtargetImpl()->isTargetDarwin() ? - "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-n8:16:32" : + "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-n8:16:32" : (getSubtargetImpl()->isTargetCygMing() || getSubtargetImpl()->isTargetWindows()) ? 
- "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-n8:16:32" : - "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-n8:16:32"), + "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-n8:16:32" : + "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-n8:16:32"), InstrInfo(*this), TSInfo(*this), TLInfo(*this), @@ -111,7 +105,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT, X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT, const std::string &FS) : X86TargetMachine(T, TT, FS, true), - DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-n8:16:32:64"), + DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-n8:16:32:64"), InstrInfo(*this), TSInfo(*this), TLInfo(*this), diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index c15dfbb..1231798 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -38,6 +38,12 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer); } +MCSymbol *X8664_MachoTargetObjectFile:: +getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const { + return Mang->getSymbol(GV); +} + unsigned X8632_ELFTargetObjectFile::getPersonalityEncoding() const { if (TM.getRelocationModel() == Reloc::PIC_) return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4; @@ -52,7 +58,7 @@ unsigned X8632_ELFTargetObjectFile::getLSDAEncoding() const { return DW_EH_PE_absptr; } -unsigned X8632_ELFTargetObjectFile::getFDEEncoding() const { +unsigned X8632_ELFTargetObjectFile::getFDEEncoding(bool FDE) const { if (TM.getRelocationModel() == Reloc::PIC_) return DW_EH_PE_pcrel | DW_EH_PE_sdata4; else @@ -91,17 +97,14 @@ unsigned X8664_ELFTargetObjectFile::getLSDAEncoding() const { return DW_EH_PE_absptr; } -unsigned X8664_ELFTargetObjectFile::getFDEEncoding() const { - CodeModel::Model Model = TM.getCodeModel(); - if (TM.getRelocationModel() == Reloc::PIC_) - return DW_EH_PE_pcrel | (Model == CodeModel::Small || - Model == CodeModel::Medium ? - DW_EH_PE_sdata4 : DW_EH_PE_sdata8); +unsigned X8664_ELFTargetObjectFile::getFDEEncoding(bool CFI) const { + if (CFI) + return DW_EH_PE_pcrel | DW_EH_PE_sdata4; - if (Model == CodeModel::Small || Model == CodeModel::Medium) - return DW_EH_PE_udata4; + if (TM.getRelocationModel() == Reloc::PIC_) + return DW_EH_PE_pcrel | DW_EH_PE_sdata4; - return DW_EH_PE_absptr; + return DW_EH_PE_udata4; } unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const { diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index f2fd49c..e21b5bf 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -25,6 +25,12 @@ namespace llvm { getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding, MCStreamer &Streamer) const; + + // getCFIPersonalitySymbol - The symbol that gets passed to + // .cfi_personality. 
+ virtual MCSymbol * + getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const; }; class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF { @@ -34,7 +40,7 @@ namespace llvm { :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; + virtual unsigned getFDEEncoding(bool CFI) const; virtual unsigned getTTypeEncoding() const; }; @@ -45,7 +51,7 @@ namespace llvm { :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; + virtual unsigned getFDEEncoding(bool CFI) const; virtual unsigned getTTypeEncoding() const; }; diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp index fc8a07a..6bec9f9 100644 --- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -30,8 +30,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include <queue> -#include <set> using namespace llvm; /// XCoreDAGToDAGISel - XCore specific code to select XCore machine @@ -49,7 +47,8 @@ namespace { Subtarget(*TM.getSubtargetImpl()) { } SDNode *Select(SDNode *N); - + SDNode *SelectBRIND(SDNode *N); + /// getI32Imm - Return a target constant with the specified value, of type /// i32. inline SDValue getI32Imm(unsigned Imm) { @@ -154,62 +153,133 @@ bool XCoreDAGToDAGISel::SelectADDRcpii(SDValue Addr, SDValue &Base, SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { DebugLoc dl = N->getDebugLoc(); - EVT NVT = N->getValueType(0); - if (NVT == MVT::i32) { - switch (N->getOpcode()) { - default: break; - case ISD::Constant: { - uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue(); - if (immMskBitp(N)) { - // Transformation function: get the size of a mask - // Look for the first non-zero bit - SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(Val)); - return CurDAG->getMachineNode(XCore::MKMSK_rus, dl, - MVT::i32, MskSize); - } - else if (!isUInt<16>(Val)) { - SDValue CPIdx = - CurDAG->getTargetConstantPool(ConstantInt::get( - Type::getInt32Ty(*CurDAG->getContext()), Val), - TLI.getPointerTy()); - return CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32, - MVT::Other, CPIdx, - CurDAG->getEntryNode()); - } - break; - } - case XCoreISD::LADD: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2) }; - return CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32, - Ops, 3); - } - case XCoreISD::LSUB: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2) }; - return CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32, - Ops, 3); - } - case XCoreISD::MACCU: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2), N->getOperand(3) }; - return CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32, MVT::i32, - Ops, 4); - } - case XCoreISD::MACCS: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2), N->getOperand(3) }; - return CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32, MVT::i32, - Ops, 4); - } - case XCoreISD::LMUL: { - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), - N->getOperand(2), N->getOperand(3) }; - return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32, - Ops, 4); - } - // Other cases are autogenerated. 
+ switch (N->getOpcode()) { + default: break; + case ISD::Constant: { + uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue(); + if (immMskBitp(N)) { + // Transformation function: get the size of a mask + // Look for the first non-zero bit + SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(Val)); + return CurDAG->getMachineNode(XCore::MKMSK_rus, dl, + MVT::i32, MskSize); + } + else if (!isUInt<16>(Val)) { + SDValue CPIdx = + CurDAG->getTargetConstantPool(ConstantInt::get( + Type::getInt32Ty(*CurDAG->getContext()), Val), + TLI.getPointerTy()); + return CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32, + MVT::Other, CPIdx, + CurDAG->getEntryNode()); } + break; + } + case XCoreISD::LADD: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2) }; + return CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32, + Ops, 3); + } + case XCoreISD::LSUB: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2) }; + return CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32, + Ops, 3); + } + case XCoreISD::MACCU: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3) }; + return CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32, MVT::i32, + Ops, 4); + } + case XCoreISD::MACCS: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3) }; + return CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32, MVT::i32, + Ops, 4); + } + case XCoreISD::LMUL: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3) }; + return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32, + Ops, 4); + } + case ISD::BRIND: + if (SDNode *ResNode = SelectBRIND(N)) + return ResNode; + break; + // Other cases are autogenerated. } return SelectCode(N); } + +/// Given a chain return a new chain where any appearance of Old is replaced +/// by New. There must be at most one instruction between Old and Chain and +/// this instruction must be a TokenFactor. Returns an empty SDValue if +/// these conditions don't hold. +static SDValue +replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New) +{ + if (Chain == Old) + return New; + if (Chain->getOpcode() != ISD::TokenFactor) + return SDValue(); + SmallVector<SDValue, 8> Ops; + bool found = false; + for (unsigned i = 0, e = Chain->getNumOperands(); i != e; ++i) { + if (Chain->getOperand(i) == Old) { + Ops.push_back(New); + found = true; + } else { + Ops.push_back(Chain->getOperand(i)); + } + } + if (!found) + return SDValue(); + return CurDAG->getNode(ISD::TokenFactor, Chain->getDebugLoc(), MVT::Other, + &Ops[0], Ops.size()); +} + +SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) { + DebugLoc dl = N->getDebugLoc(); + // (brind (int_xcore_checkevent (addr))) + SDValue Chain = N->getOperand(0); + SDValue Addr = N->getOperand(1); + if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return 0; + unsigned IntNo = cast<ConstantSDNode>(Addr->getOperand(1))->getZExtValue(); + if (IntNo != Intrinsic::xcore_checkevent) + return 0; + SDValue nextAddr = Addr->getOperand(2); + SDValue CheckEventChainOut(Addr.getNode(), 1); + if (!CheckEventChainOut.use_empty()) { + // If the chain out of the checkevent intrinsic is an operand of the + // indirect branch or used in a TokenFactor which is the operand of the + // indirect branch then build a new chain which uses the chain coming into + // the checkevent intrinsic instead. 
+    SDValue CheckEventChainIn = Addr->getOperand(0);
+    SDValue NewChain = replaceInChain(CurDAG, Chain, CheckEventChainOut,
+                                      CheckEventChainIn);
+    if (!NewChain.getNode())
+      return 0;
+    Chain = NewChain;
+  }
+  // Enable events on the thread using setsr 1 and then disable them
+  // immediately after with clrsr 1. If any resources owned by the thread are
+  // ready, an event will be taken. If no resource is ready, we branch to the
+  // address which was the operand to the checkevent intrinsic.
+  SDValue constOne = getI32Imm(1);
+  SDValue Glue =
+    SDValue(CurDAG->getMachineNode(XCore::SETSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Chain), 0);
+  Glue =
+    SDValue(CurDAG->getMachineNode(XCore::CLRSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Glue), 0);
+  if (nextAddr->getOpcode() == XCoreISD::PCRelativeWrapper &&
+      nextAddr->getOperand(0)->getOpcode() == ISD::TargetBlockAddress) {
+    return CurDAG->SelectNodeTo(N, XCore::BRFU_lu6, MVT::Other,
+                                nextAddr->getOperand(0), Glue);
+  }
+  return CurDAG->SelectNodeTo(N, XCore::BAU_1r, MVT::Other, nextAddr, Glue);
+}
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 4817787..5987e8b 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -37,8 +37,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/VectorExtras.h"
-#include <queue>
-#include <set>
 using namespace llvm;

 const char *XCoreTargetLowering::
@@ -967,7 +965,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,

   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag is necessary since all emitted instructions must be
   // stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index ecdd4cb..789546e 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -308,6 +308,16 @@ multiclass FU6_LU6<string OpcStr, SDNode OpNode> {
                  !strconcat(OpcStr, " $b"),
                  [(OpNode immU16:$b)]>;
 }
+multiclass FU6_LU6_int<string OpcStr, Intrinsic Int> {
+  def _u6: _FU6<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 [(Int immU6:$b)]>;
+  def _lu6: _FLU6<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 [(Int immU16:$b)]>;
+}

 multiclass FU6_LU6_np<string OpcStr> {
   def _u6: _FU6<
@@ -638,8 +648,8 @@ defm RETSP : FU6_LU6<"retsp", XCoreRetsp>;
 }
 }

-// TODO extdp, kentsp, krestsp, blat, setsr
-// clrsr, getsr, kalli
+// TODO extdp, kentsp, krestsp, blat
+// getsr, kalli
 let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
 def BRBU_u6 : _FU6<
                  (outs),
@@ -678,6 +688,17 @@ def LDAWCP_lu6: _FLRU6<
                  "ldaw r11, cp[$a]",
                  [(set R11, ADDRcpii:$a)]>;

+defm SETSR : FU6_LU6_int<"setsr", int_xcore_setsr>;
+
+defm CLRSR : FU6_LU6_int<"clrsr", int_xcore_clrsr>;
+
+// setsr may cause a branch if it is used to enable events. clrsr may
+// branch if it is executed while events are enabled.
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in { +defm SETSR_branch : FU6_LU6_np<"setsr">; +defm CLRSR_branch : FU6_LU6_np<"clrsr">; +} + // U10 // TODO ldwcpl, blacp @@ -718,7 +739,7 @@ def BL_lu10 : _FLU10< } // Two operand short -// TODO getr, getst +// TODO eet, eef, testwct, tsetmr, sext (reg), zext (reg) def NOT : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b), "not $dst, $b", [(set GRRegs:$dst, (not GRRegs:$b))]>; @@ -727,8 +748,6 @@ def NEG : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b), "neg $dst, $b", [(set GRRegs:$dst, (ineg GRRegs:$b))]>; -// TODO setd, eet, eef, testwct, tinitpc, tinitdp, -// tinitsp, tinitcp, tsetmr, sext (reg), zext (reg) let Constraints = "$src1 = $dst" in { let neverHasSideEffects = 1 in def SEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2), @@ -816,9 +835,29 @@ def SETD_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), "setd res[$r], $val", [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>; +def GETST_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), + "getst $dst, res[$r]", + [(set GRRegs:$dst, (int_xcore_getst GRRegs:$r))]>; + +def INITSP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:sp, $src", + [(int_xcore_initsp GRRegs:$t, GRRegs:$src)]>; + +def INITPC_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:pc, $src", + [(int_xcore_initpc GRRegs:$t, GRRegs:$src)]>; + +def INITCP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:cp, $src", + [(int_xcore_initcp GRRegs:$t, GRRegs:$src)]>; + +def INITDP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:dp, $src", + [(int_xcore_initdp GRRegs:$t, GRRegs:$src)]>; + // Two operand long -// TODO setclk, setrdy, setpsc, endin, peek, -// getd, testlcl, tinitlr, getps, setps +// TODO endin, peek, +// getd, testlcl def BITREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), "bitrev $dst, $src", [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>; @@ -839,10 +878,41 @@ def SETTW_l2r : _FL2R<(outs), (ins GRRegs:$r, GRRegs:$val), "settw res[$r], $val", [(int_xcore_settw GRRegs:$r, GRRegs:$val)]>; +def GETPS_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), + "get $dst, ps[$src]", + [(set GRRegs:$dst, (int_xcore_getps GRRegs:$src))]>; + +def SETPS_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), + "set ps[$src1], $src2", + [(int_xcore_setps GRRegs:$src1, GRRegs:$src2)]>; + +def INITLR_l2r : _FL2R<(outs), (ins GRRegs:$t, GRRegs:$src), + "init t[$t]:lr, $src", + [(int_xcore_initlr GRRegs:$t, GRRegs:$src)]>; + +def SETCLK_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), + "setclk res[$src1], $src2", + [(int_xcore_setclk GRRegs:$src1, GRRegs:$src2)]>; + +def SETRDY_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), + "setrdy res[$src1], $src2", + [(int_xcore_setrdy GRRegs:$src1, GRRegs:$src2)]>; + +def SETPSC_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), + "setpsc res[$src1], $src2", + [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>; + // One operand short -// TODO edu, eeu, waitet, waitef, tstart, msync, mjoin, clrtp +// TODO edu, eeu, waitet, waitef, tstart, clrtp // setdp, setcp, setev, kcall // dgetreg +def MSYNC_1r : _F1R<(outs), (ins GRRegs:$i), + "msync res[$i]", + [(int_xcore_msync GRRegs:$i)]>; +def MJOIN_1r : _F1R<(outs), (ins GRRegs:$i), + "mjoin res[$i]", + [(int_xcore_mjoin GRRegs:$i)]>; + let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in def BAU_1r : _F1R<(outs), (ins GRRegs:$addr), "bau $addr", @@ -899,7 +969,7 @@ def EEU_1r : _F1R<(outs), (ins GRRegs:$r), [(int_xcore_eeu GRRegs:$r)]>; // Zero 
operand short
-// TODO ssync, freet, ldspc, stspc, ldssr, stssr, ldsed, stsed,
+// TODO freet, ldspc, stspc, ldssr, stssr, ldsed, stsed,
 // stet, geted, getet, getkep, getksp, setkep, getid, kret, dcall, dret,
 // dentsp, drestsp
@@ -910,6 +980,10 @@ def GETID_0R : _F0R<(outs), (ins),
                 "get r11, id",
                 [(set R11, (int_xcore_getid))]>;

+def SSYNC_0r : _F0R<(outs), (ins),
+                "ssync",
+                [(int_xcore_ssync)]>;
+
 let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
     hasSideEffects = 1 in
 def WAITEU_0R : _F0R<(outs), (ins),
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 56c0879..0287a51 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -104,6 +104,11 @@ XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
   return TFI->hasFP(MF);
 }

+bool
+XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  return false;
+}
+
 // This function eliminates ADJCALLSTACKDOWN,
 // ADJCALLSTACKUP pseudo instructions
 void XCoreRegisterInfo::
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 2185755..770483b 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -48,6 +48,8 @@ public:

   bool requiresRegisterScavenging(const MachineFunction &MF) const;

+  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const;
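For reference, the BRIND-of-checkevent selection added in XCoreISelDAGToDAG.cpp above reduces to a three-instruction sequence; a sketch of the selected code (the branch-target register is illustrative, and BRFU_lu6 is selected instead of bau when the target is a known block address):

    setsr 1      // enable events: if a resource owned by the thread is ready,
                 // an event is taken here and control never falls through
    clrsr 1      // no event was ready: disable events again
    bau   r0     // branch to the address that was passed to checkevent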