author     rdivacky <rdivacky@FreeBSD.org>   2010-04-02 08:54:30 +0000
committer  rdivacky <rdivacky@FreeBSD.org>   2010-04-02 08:54:30 +0000
commit     20e856b2a58d12231aa42d5d13888b15ac03e5a4
tree       cf5763d092b81cecc168fa28032247ee495d06e2   /lib/Target
parent     2f2afc1aae898651e26987a5c71f3febb19bca98
Update LLVM to r100181.
Diffstat (limited to 'lib/Target')
72 files changed, 2678 insertions, 1734 deletions
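The ARMBaseInstrInfo hunks below make AnalyzeBranch and RemoveBranch step over DBG_VALUE instructions when scanning a block from its end, so debug info no longer perturbs branch analysis. A minimal, self-contained sketch of that backward-scan pattern, using a stand-in Instr type rather than LLVM's MachineInstr (the names here are illustrative, not from the tree):

#include <iostream>
#include <vector>

struct Instr {
  bool IsDebugValue;   // stand-in for MachineInstr::isDebugValue()
  bool IsTerminator;   // stand-in for an unpredicated terminator
};

// Walk backwards from the end of a block, skipping debug values, and return
// the last "real" instruction (or nullptr if there is none) -- the same scan
// the patch adds in front of the isUnpredicatedTerminator() check.
const Instr *lastRealInstr(const std::vector<Instr> &MBB) {
  auto I = MBB.end();
  if (I == MBB.begin())
    return nullptr;                 // empty block: just falls through
  --I;
  while (I->IsDebugValue) {         // ignore DBG_VALUEs at the block's tail
    if (I == MBB.begin())
      return nullptr;               // nothing but debug values
    --I;
  }
  return &*I;
}

int main() {
  std::vector<Instr> MBB = {{false, false}, {false, true}, {true, false}};
  const Instr *Last = lastRealInstr(MBB);
  std::cout << (Last && Last->IsTerminator ? "ends in a terminator\n"
                                           : "falls through\n");
}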
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index bbb1dbd..6486a60 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -43,6 +43,21 @@ def FeatureThumb2 : SubtargetFeature<"thumb2", "ThumbMode", "Thumb2", def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", "Enable half-precision floating point">; +// Some processors have multiply-accumulate instructions that don't +// play nicely with other VFP instructions, and it's generally better +// to just not use them. +// FIXME: Currently, this is only flagged for Cortex-A8. It may be true for +// others as well. We should do more benchmarking and confirm one way or +// the other. +def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true", + "Disable VFP MAC instructions">; +// Some processors benefit from using NEON instructions for scalar +// single-precision FP operations. +def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; + + //===----------------------------------------------------------------------===// // ARM Processors supported. // @@ -92,7 +107,8 @@ def : ProcNoItin<"iwmmxt", [ArchV5TE]>; // V6 Processors. def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>; -def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2, + FeatureHasSlowVMLx]>; def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>; def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>; def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>; @@ -106,7 +122,8 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, // V7 Processors. def : Processor<"cortex-a8", CortexA8Itineraries, - [ArchV7A, FeatureThumb2, FeatureNEON]>; + [ArchV7A, FeatureThumb2, FeatureNEON, FeatureHasSlowVMLx, + FeatureNEONForFP]>; def : ProcNoItin<"cortex-a9", [ArchV7A, FeatureThumb2, FeatureNEON]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index e6ea03a..0a0b0ea 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -204,7 +204,15 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. 
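The two subtarget features added in the ARM.td hunk above (vmlx, neonfp) gate how a plain single-precision multiply-accumulate is lowered. An illustrative function of the kind affected, not taken from the tree, assuming the UseVMLx and NEON-for-FP predicates introduced later in this patch:

// Illustrative only: a scalar multiply-accumulate of the kind the new
// vmlx/neonfp features are about.  With FeatureHasSlowVMLx set (now on
// cortex-a8 and arm1136jf-s) the backend stops folding this into a VFP
// vmla.f32 and emits a separate multiply and add; with FeatureNEONForFP the
// single-precision arithmetic is issued on the NEON unit instead of VFP.
float mac(float a, float b, float c) {
  return a * b + c;
}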
@@ -275,6 +283,11 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!isUncondBranchOpcode(I->getOpcode()) && !isCondBranchOpcode(I->getOpcode())) return 0; @@ -738,14 +751,16 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!"); // FIXME: Neon instructions should support predicates if (Align >= 16 && (getRegisterInfo().canRealignStack(MF))) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64)) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q)) .addFrameIndex(FI).addImm(128) .addMemOperand(MMO) .addReg(SrcReg, getKillRegState(isKill))); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRQ)). + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQ)). addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); } } } @@ -788,12 +803,14 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, RC == ARM::QPR_8RegisterClass) && "Unknown regclass!"); if (Align >= 16 && (getRegisterInfo().canRealignStack(MF))) { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) .addFrameIndex(FI).addImm(128) .addMemOperand(MMO)); } else { - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRQ), DestReg) - .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) + .addMemOperand(MMO)); } } } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 71207c8..7d48663 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -124,17 +124,17 @@ private: /// SelectDYN_ALLOC - Select dynamic alloc for Thumb. SDNode *SelectDYN_ALLOC(SDNode *N); - /// SelectVLD - Select NEON load intrinsics. NumVecs should - /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// SelectVLD - Select NEON load intrinsics. NumVecs should be + /// 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// loads of D registers and even subregs and odd subregs of Q registers. - /// For NumVecs == 2, QOpcodes1 is not used. + /// For NumVecs <= 2, QOpcodes1 is not used. SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); /// SelectVST - Select NEON store intrinsics. NumVecs should - /// be 2, 3 or 4. The opcode arrays specify the instructions used for + /// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for /// stores of D registers and even subregs and odd subregs of Q registers. - /// For NumVecs == 2, QOpcodes1 is not used. + /// For NumVecs <= 2, QOpcodes1 is not used. 
SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); @@ -1022,7 +1022,7 @@ static EVT GetNEONSubregVT(EVT VT) { SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=2 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1047,6 +1047,9 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VLD1"); + break; } SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); @@ -1060,15 +1063,15 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, } EVT RegVT = GetNEONSubregVT(VT); - if (NumVecs == 2) { - // Quad registers are directly supported for VLD2, - // loading 2 pairs of D regs. + if (NumVecs <= 2) { + // Quad registers are directly supported for VLD1 and VLD2, + // loading pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain }; - std::vector<EVT> ResTys(4, VT); + std::vector<EVT> ResTys(2 * NumVecs, RegVT); ResTys.push_back(MVT::Other); SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - Chain = SDValue(VLd, 4); + Chain = SDValue(VLd, 2 * NumVecs); // Combine the even and odd subregs to produce the result. for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { @@ -1109,7 +1112,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=2 && NumVecs <= 4 && "VST NumVecs out-of-range"); + assert(NumVecs >=1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1134,6 +1137,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; + case MVT::v2i64: OpcodeIndex = 3; + assert(NumVecs == 1 && "v2i64 type only supported for VST1"); + break; } SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); @@ -1154,9 +1160,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, } EVT RegVT = GetNEONSubregVT(VT); - if (NumVecs == 2) { - // Quad registers are directly supported for VST2, - // storing 2 pairs of D regs. + if (NumVecs <= 2) { + // Quad registers are directly supported for VST1 and VST2, + // storing pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT, @@ -1167,7 +1173,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Ops.push_back(Pred); Ops.push_back(Reg0); // predicate register Ops.push_back(Chain); - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 9); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), + 5 + 2 * NumVecs); } // Otherwise, quad registers are stored with two separate instructions, @@ -1694,6 +1701,35 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ResNode = SelectARMIndexedLoad(N); if (ResNode) return ResNode; + + // VLDMQ must be custom-selected for "v2f64 load" to set the AM5Opc value. 
+ if (Subtarget->hasVFP2() && + N->getValueType(0).getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), AM5Opc, Pred, PredReg, Chain }; + return CurDAG->getMachineNode(ARM::VLDMQ, dl, MVT::v2f64, MVT::Other, + Ops, 5); + } + // Other cases are autogenerated. + break; + } + case ISD::STORE: { + // VSTMQ must be custom-selected for "v2f64 store" to set the AM5Opc value. + if (Subtarget->hasVFP2() && + N->getOperand(1).getValueType().getSimpleVT().SimpleTy == MVT::v2f64) { + SDValue Chain = N->getOperand(0); + SDValue AM5Opc = + CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::ia, 4), MVT::i32); + SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { N->getOperand(1), N->getOperand(2), + AM5Opc, Pred, PredReg, Chain }; + return CurDAG->getMachineNode(ARM::VSTMQ, dl, MVT::Other, Ops, 6); + } // Other cases are autogenerated. break; } @@ -1831,16 +1867,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { default: break; + case Intrinsic::arm_neon_vld1: { + unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16, + ARM::VLD1d32, ARM::VLD1d64 }; + unsigned QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, + ARM::VLD1q32, ARM::VLD1q64 }; + return SelectVLD(N, 1, DOpcodes, QOpcodes, 0); + } + case Intrinsic::arm_neon_vld2: { unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16, - ARM::VLD2d32, ARM::VLD2d64 }; + ARM::VLD2d32, ARM::VLD1q64 }; unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 }; return SelectVLD(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vld3: { unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16, - ARM::VLD3d32, ARM::VLD3d64 }; + ARM::VLD3d32, ARM::VLD1d64T }; unsigned QOpcodes0[] = { ARM::VLD3q8_UPD, ARM::VLD3q16_UPD, ARM::VLD3q32_UPD }; @@ -1852,7 +1896,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vld4: { unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16, - ARM::VLD4d32, ARM::VLD4d64 }; + ARM::VLD4d32, ARM::VLD1d64Q }; unsigned QOpcodes0[] = { ARM::VLD4q8_UPD, ARM::VLD4q16_UPD, ARM::VLD4q32_UPD }; @@ -1883,16 +1927,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); } + case Intrinsic::arm_neon_vst1: { + unsigned DOpcodes[] = { ARM::VST1d8, ARM::VST1d16, + ARM::VST1d32, ARM::VST1d64 }; + unsigned QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, + ARM::VST1q32, ARM::VST1q64 }; + return SelectVST(N, 1, DOpcodes, QOpcodes, 0); + } + case Intrinsic::arm_neon_vst2: { unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16, - ARM::VST2d32, ARM::VST2d64 }; + ARM::VST2d32, ARM::VST1q64 }; unsigned QOpcodes[] = { ARM::VST2q8, ARM::VST2q16, ARM::VST2q32 }; return SelectVST(N, 2, DOpcodes, QOpcodes, 0); } case Intrinsic::arm_neon_vst3: { unsigned DOpcodes[] = { ARM::VST3d8, ARM::VST3d16, - ARM::VST3d32, ARM::VST3d64 }; + ARM::VST3d32, ARM::VST1d64T }; unsigned QOpcodes0[] = { ARM::VST3q8_UPD, ARM::VST3q16_UPD, ARM::VST3q32_UPD }; @@ -1904,7 +1956,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_neon_vst4: { unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16, - ARM::VST4d32, ARM::VST4d64 }; + ARM::VST4d32, ARM::VST1d64Q }; unsigned QOpcodes0[] = { ARM::VST4q8_UPD, ARM::VST4q16_UPD, ARM::VST4q32_UPD }; diff --git 
a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0d0a004..b6c81f6 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -456,6 +456,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // Generic (and overly aggressive) if-conversion limits. setIfCvtBlockSizeLimit(10); setIfCvtDupBlockSizeLimit(2); + } else if (Subtarget->hasV7Ops()) { + setIfCvtBlockSizeLimit(3); + setIfCvtDupBlockSizeLimit(1); } else if (Subtarget->hasV6Ops()) { setIfCvtBlockSizeLimit(2); setIfCvtDupBlockSizeLimit(1); diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 4f6f05d..4427e50 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -1,10 +1,10 @@ //===- ARMInstrFormats.td - ARM Instruction Formats --*- tablegen -*---------=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -59,7 +59,18 @@ def NEONDupFrm : Format<28>; def MiscFrm : Format<29>; def ThumbMiscFrm : Format<30>; -def NLdStFrm : Format<31>; +def NLdStFrm : Format<31>; +def N1RegModImmFrm : Format<32>; +def N2RegFrm : Format<33>; +def NVCVTFrm : Format<34>; +def NVDupLnFrm : Format<35>; +def N2RegVShLFrm : Format<36>; +def N2RegVShRFrm : Format<37>; +def N3RegFrm : Format<38>; +def N3RegVShFrm : Format<39>; +def NVExtFrm : Format<40>; +def NVMulSLFrm : Format<41>; +def NVTBLFrm : Format<42>; // Misc flags. @@ -177,13 +188,13 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im, // TSFlagsFields AddrMode AM = am; bits<4> AddrModeBits = AM.Value; - + SizeFlagVal SZ = sz; bits<3> SizeFlag = SZ.Value; IndexMode IM = im; bits<2> IndexModeBits = IM.Value; - + Format F = f; bits<6> Form = F.Value; @@ -195,7 +206,7 @@ class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im, // bit isUnaryDataProc = 0; bit canXformTo16Bit = 0; - + let Constraints = cstr; let Itinerary = itin; } @@ -214,9 +225,9 @@ class InstThumb<AddrMode am, SizeFlagVal sz, IndexMode im, Format f, Domain d, string cstr, InstrItinClass itin> : InstTemplate<am, sz, im, f, d, cstr, itin>; -class PseudoInst<dag oops, dag iops, InstrItinClass itin, +class PseudoInst<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain, + : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain, "", itin> { let OutOperandList = oops; let InOperandList = iops; @@ -226,7 +237,7 @@ class PseudoInst<dag oops, dag iops, InstrItinClass itin, // Almost all ARM instructions are predicable. 
class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - IndexMode im, Format f, InstrItinClass itin, + IndexMode im, Format f, InstrItinClass itin, string opc, string asm, string cstr, list<dag> pattern> : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { @@ -238,9 +249,9 @@ class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz, } // A few are not predicable class InoP<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - IndexMode im, Format f, InstrItinClass itin, - string opc, string asm, string cstr, - list<dag> pattern> + IndexMode im, Format f, InstrItinClass itin, + string opc, string asm, string cstr, + list<dag> pattern> : InstARM<am, sz, im, f, GenericDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = iops; @@ -290,9 +301,9 @@ class AXI<dag oops, dag iops, Format f, InstrItinClass itin, : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern>; class AInoP<dag oops, dag iops, Format f, InstrItinClass itin, - string opc, string asm, list<dag> pattern> + string opc, string asm, list<dag> pattern> : InoP<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin, - opc, asm, "", pattern>; + opc, asm, "", pattern>; // Ctrl flow instructions class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin, @@ -362,7 +373,7 @@ class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24-21} = opcod; let Inst{27-26} = {0,0}; } -class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, +class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin, opc, asm, "", pattern>; @@ -387,7 +398,7 @@ class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, +class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { @@ -407,7 +418,7 @@ class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{24} = 1; // P bit let Inst{27-26} = {0,1}; } -class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, +class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, string asm, list<dag> pattern> : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin, asm, "", pattern> { @@ -549,7 +560,7 @@ class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin, } // addrmode3 instructions -class AI3<dag oops, dag iops, Format f, InstrItinClass itin, +class AI3<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string asm, list<dag> pattern> : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin, opc, asm, "", pattern>; @@ -853,7 +864,6 @@ class AI3stdpo<dag oops, dag iops, Format f, InstrItinClass itin, let Inst{27-25} = 0b000; } - // addrmode4 instructions class AXI4ld<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin, string asm, string cstr, list<dag> pattern> @@ -961,20 +971,25 @@ class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>; // Two-address instructions -class TIt<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", pattern>; +class TIt<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : 
ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", + pattern>; // tBL, tBX 32-bit instructions class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3, - dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> - : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>, Encoding { + dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> + : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>, + Encoding { let Inst{31-27} = opcod1; let Inst{15-14} = opcod2; let Inst{12} = opcod3; } // BR_JT instructions -class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern> +class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, + list<dag> pattern> : ThumbI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; // Thumb1 only @@ -1001,7 +1016,7 @@ class T1JTI<dag oops, dag iops, InstrItinClass itin, // Two-address instructions class T1It<dag oops, dag iops, InstrItinClass itin, string asm, string cstr, list<dag> pattern> - : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, + : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, asm, cstr, pattern>; // Thumb1 instruction that can either be predicated or set CPSR. @@ -1024,7 +1039,7 @@ class T1sI<dag oops, dag iops, InstrItinClass itin, class T1sIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, - "$lhs = $dst", pattern>; + "$lhs = $dst", pattern>; // Thumb1 instruction that can be predicated. class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, @@ -1046,7 +1061,7 @@ class T1pI<dag oops, dag iops, InstrItinClass itin, class T1pIt<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, - "$lhs = $dst", pattern>; + "$lhs = $dst", pattern>; class T1pI1<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> @@ -1057,7 +1072,7 @@ class T1pI2<dag oops, dag iops, InstrItinClass itin, class T1pI4<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>; -class T1pIs<dag oops, dag iops, +class T1pIs<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>; @@ -1146,8 +1161,8 @@ class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, } class ThumbXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz, - InstrItinClass itin, - string asm, string cstr, list<dag> pattern> + InstrItinClass itin, + string asm, string cstr, list<dag> pattern> : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = iops; @@ -1161,7 +1176,7 @@ class T2I<dag oops, dag iops, InstrItinClass itin, : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>; class T2Ii12<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> - : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "", pattern>; + : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "",pattern>; class T2Ii8<dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, itin, opc, asm, "", pattern>; @@ -1196,7 +1211,7 @@ class T2JTI<dag oops, dag iops, 
InstrItinClass itin, : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>; class T2Ix2<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, list<dag> pattern> + string opc, string asm, list<dag> pattern> : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>; // Two-address instructions @@ -1295,7 +1310,7 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStFrm, itin, opc, asm, "", pattern> { + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -1309,7 +1324,7 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone, - VFPLdStFrm, itin, opc, asm, "", pattern> { + VFPLdStFrm, itin, opc, asm, "", pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-24} = opcod1; let Inst{21-20} = opcod2; @@ -1320,7 +1335,7 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops, class AXDI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, - VFPLdStMulFrm, itin, asm, cstr, pattern> { + VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. let Inst{27-25} = 0b110; let Inst{11-8} = 0b1011; @@ -1332,7 +1347,7 @@ class AXDI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, class AXSI5<dag oops, dag iops, IndexMode im, InstrItinClass itin, string asm, string cstr, list<dag> pattern> : VFPXI<oops, iops, AddrMode5, Size4Bytes, im, - VFPLdStMulFrm, itin, asm, cstr, pattern> { + VFPLdStMulFrm, itin, asm, cstr, pattern> { // TODO: Mark the instructions with the appropriate subtarget info. 
let Inst{27-25} = 0b110; let Inst{11-8} = 0b1010; @@ -1353,7 +1368,8 @@ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, // Double precision, binary class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, - dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { let Inst{27-23} = opcod1; let Inst{21-20} = opcod2; @@ -1362,6 +1378,20 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, let Inst{4} = op4; } +// Double precision, binary, VML[AS] (for additional predicate) +class ADbI_vmlX<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> { + let Inst{27-23} = opcod1; + let Inst{21-20} = opcod2; + let Inst{11-8} = 0b1011; + let Inst{6} = op6; + let Inst{4} = op4; + list<Predicate> Predicates = [HasVFP2, UseVMLx]; +} + + // Single precision, unary class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4, bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc, @@ -1399,7 +1429,8 @@ class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops, // Single precision binary, if no NEON // Same as ASbI except not available if NEON is enabled class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, - dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> + dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> { list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP]; } @@ -1419,8 +1450,8 @@ class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4, // VFP conversion between floating-point and fixed-point class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, - dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> { // size (fixed-point number): sx == 0 ? 16 : 32 let Inst{7} = op5; // sx @@ -1448,7 +1479,7 @@ class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>; -class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, +class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>; @@ -1480,9 +1511,10 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, } // Same as NeonI except it does not have a "data type" specifier. 
-class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : InstARM<am, Size4Bytes, im, NEONFrm, NeonDomain, cstr, itin> { +class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f, + InstrItinClass itin, string opc, string asm, string cstr, + list<dag> pattern> + : InstARM<am, Size4Bytes, im, f, NeonDomain, cstr, itin> { let OutOperandList = oops; let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(!strconcat(opc, "${p}"), !strconcat("\t", asm)); @@ -1490,18 +1522,6 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin, list<Predicate> Predicates = [HasNEON]; } -class NI<dag oops, dag iops, InstrItinClass itin, string opc, string asm, - list<dag> pattern> - : NeonXI<oops, iops, AddrModeNone, IndexModeNone, itin, opc, asm, "", - pattern> { -} - -class NI4<dag oops, dag iops, InstrItinClass itin, string opc, - string asm, list<dag> pattern> - : NeonXI<oops, iops, AddrMode4, IndexModeNone, itin, opc, asm, "", - pattern> { -} - class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> @@ -1514,17 +1534,17 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4, let Inst{7-4} = op7_4; } -class NDataI<dag oops, dag iops, InstrItinClass itin, +class NDataI<dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NeonI<oops, iops, AddrModeNone, IndexModeNone, NEONFrm, itin, opc, dt, asm, - cstr, pattern> { + : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr, + pattern> { let Inst{31-25} = 0b1111001; } -class NDataXI<dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : NeonXI<oops, iops, AddrModeNone, IndexModeNone, itin, opc, asm, - cstr, pattern> { +class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm, + cstr, pattern> { let Inst{31-25} = 0b1111001; } @@ -1532,8 +1552,9 @@ class NDataXI<dag oops, dag iops, InstrItinClass itin, class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6, bit op5, bit op4, dag oops, dag iops, InstrItinClass itin, - string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + string opc, string dt, string asm, string cstr, + list<dag> pattern> + : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> { let Inst{23} = op23; let Inst{21-19} = op21_19; let Inst{11-8} = op11_8; @@ -1548,7 +1569,7 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op6, bit op4, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> { let Inst{24-23} = op24_23; let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; @@ -1560,10 +1581,10 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // Same as N2V except it doesn't have a datatype suffix. 
class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, - bits<5> op11_7, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : NDataXI<oops, iops, itin, opc, asm, cstr, pattern> { + bits<5> op11_7, bit op6, bit op4, + dag oops, dag iops, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> { let Inst{24-23} = op24_23; let Inst{21-20} = op21_20; let Inst{19-18} = op19_18; @@ -1575,9 +1596,9 @@ class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // NEON 2 vector register with immediate. class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, + dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{11-8} = op11_8; @@ -1588,9 +1609,9 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, // NEON 3 vector register format. class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, + dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, string cstr, list<dag> pattern> - : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> { + : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-20} = op21_20; @@ -1599,11 +1620,12 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, let Inst{4} = op4; } -// Same as N3VX except it doesn't have a data type suffix. -class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, - dag oops, dag iops, InstrItinClass itin, - string opc, string asm, string cstr, list<dag> pattern> - : NDataXI<oops, iops, itin, opc, asm, cstr, pattern> { +// Same as N3V except it doesn't have a data type suffix. 
+class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, + bit op4, + dag oops, dag iops, Format f, InstrItinClass itin, + string opc, string asm, string cstr, list<dag> pattern> + : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> { let Inst{24} = op24; let Inst{23} = op23; let Inst{21-20} = op21_20; @@ -1617,7 +1639,7 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, Format f, InstrItinClass itin, string opc, string dt, string asm, list<dag> pattern> : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, GenericDomain, - "", itin> { + "", itin> { let Inst{27-20} = opcod1; let Inst{11-8} = opcod2; let Inst{6-5} = opcod3; @@ -1647,6 +1669,19 @@ class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3, : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin, opc, dt, asm, pattern>; +// Vector Duplicate Lane (from scalar to all elements) +class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops, + InstrItinClass itin, string opc, string dt, string asm, + list<dag> pattern> + : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> { + let Inst{24-23} = 0b11; + let Inst{21-20} = 0b11; + let Inst{19-16} = op19_16; + let Inst{11-7} = 0b11000; + let Inst{6} = op6; + let Inst{4} = 0; +} + // NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON // for single-precision FP. class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> { diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 26a2806..f2ab06f 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -140,6 +140,8 @@ def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; def UseMovt : Predicate<"Subtarget->useMovt()">; def DontUseMovt : Predicate<"!Subtarget->useMovt()">; +def UseVMLx : Predicate<"Subtarget->useVMLx()">; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index c977cc3..ed9d31d 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -115,51 +115,80 @@ def h64imm : Operand<i64> { // NEON load / store instructions //===----------------------------------------------------------------------===// +let mayLoad = 1 in { // Use vldmia to load a Q register as a D register pair. -def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr), IIC_fpLoadm, - "vldmia", "$addr, ${dst:dregpair}", - [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> { - let Inst{27-25} = 0b110; - let Inst{24} = 0; // P bit - let Inst{23} = 1; // U bit - let Inst{20} = 1; - let Inst{11-8} = 0b1011; -} +// This is equivalent to VLDMD except that it has a Q register operand +// instead of a pair of D registers. +def VLDMQ + : AXDI5<(outs QPR:$dst), (ins addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}, ${dst:dregpair}", "", []>; +def VLDMQ_UPD + : AXDI5<(outs QPR:$dst, GPR:$wb), (ins addrmode5:$addr, pred:$p), + IndexModeUpd, IIC_fpLoadm, + "vldm${addr:submode}${p}\t${addr:base}!, ${dst:dregpair}", + "$addr.base = $wb", []>; + +// Use vld1 to load a Q register as a D register pair. +// This alternative to VLDMQ allows an alignment to be specified. +// This is equivalent to VLD1q64 except that it has a Q register operand. 
+def VLD1q + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst), (ins addrmode6:$addr), + IIC_VLD1, "vld1", "64", "${dst:dregpair}, $addr", "", []>; +def VLD1q_UPD + : NLdSt<0,0b10,0b1010,0b1100, (outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", "64", + "${dst:dregpair}, $addr$offset", "$addr.addr = $wb", []>; +} // mayLoad = 1 +let mayStore = 1 in { // Use vstmia to store a Q register as a D register pair. -def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr), IIC_fpStorem, - "vstmia", "$addr, ${src:dregpair}", - [(store (v2f64 QPR:$src), addrmode4:$addr)]> { - let Inst{27-25} = 0b110; - let Inst{24} = 0; // P bit - let Inst{23} = 1; // U bit - let Inst{20} = 0; - let Inst{11-8} = 0b1011; -} +// This is equivalent to VSTMD except that it has a Q register operand +// instead of a pair of D registers. +def VSTMQ + : AXDI5<(outs), (ins QPR:$src, addrmode5:$addr, pred:$p), + IndexModeNone, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}, ${src:dregpair}", "", []>; +def VSTMQ_UPD + : AXDI5<(outs GPR:$wb), (ins QPR:$src, addrmode5:$addr, pred:$p), + IndexModeUpd, IIC_fpStorem, + "vstm${addr:submode}${p}\t${addr:base}!, ${src:dregpair}", + "$addr.base = $wb", []>; + +// Use vst1 to store a Q register as a D register pair. +// This alternative to VSTMQ allows an alignment to be specified. +// This is equivalent to VST1q64 except that it has a Q register operand. +def VST1q + : NLdSt<0,0b00,0b1010,0b1100, (outs), (ins addrmode6:$addr, QPR:$src), + IIC_VST, "vst1", "64", "${src:dregpair}, $addr", "", []>; +def VST1q_UPD + : NLdSt<0,0b00,0b1010,0b1100, (outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src), + IIC_VST, "vst1", "64", "{$src:dregpair}, $addr$offset", + "$addr.addr = $wb", []>; +} // mayStore = 1 -// VLD1 : Vector Load (multiple single elements) -class VLD1D<bits<4> op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "\\{$dst\\}, $addr", "", - [(set DPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; -class VLD1Q<bits<4> op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst), (ins addrmode6:$addr), IIC_VLD1, - "vld1", Dt, "${dst:dregpair}, $addr", "", - [(set QPR:$dst, (Ty (int_arm_neon_vld1 addrmode6:$addr)))]>; - -def VLD1d8 : VLD1D<0b0000, "8", v8i8>; -def VLD1d16 : VLD1D<0b0100, "16", v4i16>; -def VLD1d32 : VLD1D<0b1000, "32", v2i32>; -def VLD1df : VLD1D<0b1000, "32", v2f32>; -def VLD1d64 : VLD1D<0b1100, "64", v1i64>; - -def VLD1q8 : VLD1Q<0b0000, "8", v16i8>; -def VLD1q16 : VLD1Q<0b0100, "16", v8i16>; -def VLD1q32 : VLD1Q<0b1000, "32", v4i32>; -def VLD1qf : VLD1Q<0b1000, "32", v4f32>; -def VLD1q64 : VLD1Q<0b1100, "64", v2i64>; +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { -let mayLoad = 1 in { +// VLD1 : Vector Load (multiple single elements) +class VLD1D<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst\\}, $addr", "", []>; +class VLD1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$dst1, DPR:$dst2), + (ins addrmode6:$addr), IIC_VLD1, + "vld1", Dt, "\\{$dst1, $dst2\\}, $addr", "", []>; + +def VLD1d8 : VLD1D<0b0000, "8">; +def VLD1d16 : VLD1D<0b0100, "16">; +def VLD1d32 : VLD1D<0b1000, "32">; +def VLD1d64 : VLD1D<0b1100, "64">; + +def VLD1q8 : VLD1Q<0b0000, "8">; +def VLD1q16 : VLD1Q<0b0100, "16">; +def VLD1q32 : VLD1Q<0b1000, "32">; +def VLD1q64 : VLD1Q<0b1100, "64">; // ...with address register writeback: class 
VLD1DWB<bits<4> op7_4, string Dt> @@ -182,54 +211,48 @@ def VLD1q8_UPD : VLD1QWB<0b0000, "8">; def VLD1q16_UPD : VLD1QWB<0b0100, "16">; def VLD1q32_UPD : VLD1QWB<0b1000, "32">; def VLD1q64_UPD : VLD1QWB<0b1100, "64">; -} // mayLoad = 1 -let mayLoad = 1, hasExtraDefRegAllocReq = 1 in { - -// These (dreg triple/quadruple) are for disassembly only. +// ...with 3 registers (some of these are only for the disassembler): class VLD1D3<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; -class VLD1D4<bits<4> op7_4, string Dt> - : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; - -def VLD1d8T : VLD1D3<0b0000, "8">; -def VLD1d16T : VLD1D3<0b0100, "16">; -def VLD1d32T : VLD1D3<0b1000, "32">; -// VLD1d64T : implemented as VLD3d64 - -def VLD1d8Q : VLD1D4<0b0000, "8">; -def VLD1d16Q : VLD1D4<0b0100, "16">; -def VLD1d32Q : VLD1D4<0b1000, "32">; -// VLD1d64Q : implemented as VLD4d64 - -// ...with address register writeback: + "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; class VLD1D3WB<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, - "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "\\{$dst1, $dst2, $dst3\\}, $addr$offset", "$addr.addr = $wb", []>; + +def VLD1d8T : VLD1D3<0b0000, "8">; +def VLD1d16T : VLD1D3<0b0100, "16">; +def VLD1d32T : VLD1D3<0b1000, "32">; +def VLD1d64T : VLD1D3<0b1100, "64">; + +def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; +def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; +def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; +def VLD1d64T_UPD : VLD1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VLD1D4<bits<4> op7_4, string Dt> + : NLdSt<0,0b10,0b0010,op7_4,(outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), + (ins addrmode6:$addr), IIC_VLD1, "vld1", Dt, + "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", "", []>; class VLD1D4WB<bits<4> op7_4, string Dt> : NLdSt<0,0b10,0b0010,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, "vld1", Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + []>; -def VLD1d8T_UPD : VLD1D3WB<0b0000, "8">; -def VLD1d16T_UPD : VLD1D3WB<0b0100, "16">; -def VLD1d32T_UPD : VLD1D3WB<0b1000, "32">; -// VLD1d64T_UPD : implemented as VLD3d64_UPD +def VLD1d8Q : VLD1D4<0b0000, "8">; +def VLD1d16Q : VLD1D4<0b0100, "16">; +def VLD1d32Q : VLD1D4<0b1000, "32">; +def VLD1d64Q : VLD1D4<0b1100, "64">; def VLD1d8Q_UPD : VLD1D4WB<0b0000, "8">; def VLD1d16Q_UPD : VLD1D4WB<0b0100, "16">; def VLD1d32Q_UPD : VLD1D4WB<0b1000, "32">; -// VLD1d64Q_UPD : implemented as VLD4d64_UPD +def VLD1d64Q_UPD : VLD1D4WB<0b1100, "64">; // VLD2 : Vector Load (multiple 2-element structures) class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -245,9 +268,6 @@ class VLD2Q<bits<4> op7_4, string Dt> def VLD2d8 : VLD2D<0b1000, 0b0000, "8">; def VLD2d16 : VLD2D<0b1000, 0b0100, "16">; def VLD2d32 : VLD2D<0b1000, 0b1000, "32">; -def VLD2d64 : NLdSt<0,0b10,0b1010,0b1100, (outs DPR:$dst1, 
DPR:$dst2), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr", "", []>; def VLD2q8 : VLD2Q<0b0000, "8">; def VLD2q16 : VLD2Q<0b0100, "16">; @@ -269,11 +289,6 @@ class VLD2QWB<bits<4> op7_4, string Dt> def VLD2d8_UPD : VLD2DWB<0b1000, 0b0000, "8">; def VLD2d16_UPD : VLD2DWB<0b1000, 0b0100, "16">; def VLD2d32_UPD : VLD2DWB<0b1000, 0b1000, "32">; -def VLD2d64_UPD : NLdSt<0,0b10,0b1010,0b1100, - (outs DPR:$dst1, DPR:$dst2, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2\\}, $addr$offset", - "$addr.addr = $wb", []>; def VLD2q8_UPD : VLD2QWB<0b0000, "8">; def VLD2q16_UPD : VLD2QWB<0b0100, "16">; @@ -296,10 +311,6 @@ class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD3d8 : VLD3D<0b0100, 0b0000, "8">; def VLD3d16 : VLD3D<0b0100, 0b0100, "16">; def VLD3d32 : VLD3D<0b0100, 0b1000, "32">; -def VLD3d64 : NLdSt<0,0b10,0b0110,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>; // ...with address register writeback: class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -312,11 +323,6 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD3d8_UPD : VLD3DWB<0b0100, 0b0000, "8">; def VLD3d16_UPD : VLD3DWB<0b0100, 0b0100, "16">; def VLD3d32_UPD : VLD3DWB<0b0100, 0b1000, "32">; -def VLD3d64_UPD : NLdSt<0,0b10,0b0110,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VLD3q8 : VLD3D<0b0101, 0b0000, "8">; @@ -341,11 +347,6 @@ class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD4d8 : VLD4D<0b0000, 0b0000, "8">; def VLD4d16 : VLD4D<0b0000, 0b0100, "16">; def VLD4d32 : VLD4D<0b0000, 0b1000, "32">; -def VLD4d64 : NLdSt<0,0b10,0b0010,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), - (ins addrmode6:$addr), IIC_VLD1, - "vld1", "64", "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr", - "", []>; // ...with address register writeback: class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -358,13 +359,6 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VLD4d8_UPD : VLD4DWB<0b0000, 0b0000, "8">; def VLD4d16_UPD : VLD4DWB<0b0000, 0b0100, "16">; def VLD4d32_UPD : VLD4DWB<0b0000, 0b1000, "32">; -def VLD4d64_UPD : NLdSt<0,0b10,0b0010,0b1100, - (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, - GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset), IIC_VLD1, - "vld1", "64", - "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VLD4q8 : VLD4D<0b0001, 0b0000, "8">; @@ -383,62 +377,62 @@ def VLD4q32odd_UPD : VLD4DWB<0b0001, 0b1000, "32">; // FIXME: Not yet implemented. 
// VLD2LN : Vector Load (single 2-element structure to one lane) -class VLD2LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2), +class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2", []>; -def VLD2LNd8 : VLD2LN<0b0001, "8">; -def VLD2LNd16 : VLD2LN<0b0101, "16"> { let Inst{5} = 0; } -def VLD2LNd32 : VLD2LN<0b1001, "32"> { let Inst{6} = 0; } +def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32 : VLD2LN<0b1001, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VLD2LNq16 : VLD2LN<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32 : VLD2LN<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32 : VLD2LN<0b1001, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VLD2LNq16odd : VLD2LN<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32odd : VLD2LN<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16odd : VLD2LN<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32odd : VLD2LN<0b1001, {?,1,?,?}, "32">; // ...with address register writeback: -class VLD2LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), +class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2, "vld2", Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr$offset", "$src1 = $dst1, $src2 = $dst2, $addr.addr = $wb", []>; -def VLD2LNd8_UPD : VLD2LNWB<0b0001, "8">; -def VLD2LNd16_UPD : VLD2LNWB<0b0101, "16"> { let Inst{5} = 0; } -def VLD2LNd32_UPD : VLD2LNWB<0b1001, "32"> { let Inst{6} = 0; } +def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8">; +def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16">; +def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,?,?}, "32">; -def VLD2LNq16_UPD : VLD2LNWB<0b0101, "16"> { let Inst{5} = 1; } -def VLD2LNq32_UPD : VLD2LNWB<0b1001, "32"> { let Inst{6} = 1; } +def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16">; +def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,?,?}, "32">; // VLD3LN : Vector Load (single 3-element structure to one lane) -class VLD3LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), +class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VLD3, "vld3", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>; -def VLD3LNd8 : VLD3LN<0b0010, "8"> { let Inst{4} = 0; } -def VLD3LNd16 : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VLD3LNd32 : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32">; // ...with double-spaced registers: -def VLD3LNq16 : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32 : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32">; // ...alternate versions to be allocated odd 
register numbers: -def VLD3LNq16odd : VLD3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32odd : VLD3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16odd : VLD3LN<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32odd : VLD3LN<0b1010, {?,1,0,0}, "32">; // ...with address register writeback: -class VLD3LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), @@ -447,37 +441,37 @@ class VLD3LNWB<bits<4> op11_8, string Dt> "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $addr.addr = $wb", []>; -def VLD3LNd8_UPD : VLD3LNWB<0b0010, "8"> { let Inst{4} = 0; } -def VLD3LNd16_UPD : VLD3LNWB<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VLD3LNd32_UPD : VLD3LNWB<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8">; +def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16">; +def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32">; -def VLD3LNq16_UPD : VLD3LNWB<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VLD3LNq32_UPD : VLD3LNWB<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16">; +def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32">; // VLD4LN : Vector Load (single 4-element structure to one lane) -class VLD4LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VLD4, "vld4", Dt, "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr", "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>; -def VLD4LNd8 : VLD4LN<0b0011, "8">; -def VLD4LNd16 : VLD4LN<0b0111, "16"> { let Inst{5} = 0; } -def VLD4LNd32 : VLD4LN<0b1011, "32"> { let Inst{6} = 0; } +def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16">; +def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VLD4LNq16 : VLD4LN<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32 : VLD4LN<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VLD4LNq16odd : VLD4LN<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32odd : VLD4LN<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16odd : VLD4LN<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32odd : VLD4LN<0b1011, {?,1,?,?}, "32">; // ...with address register writeback: -class VLD4LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b10, op11_8, {?,?,?,?}, +class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), @@ -486,12 +480,12 @@ class VLD4LNWB<bits<4> op11_8, string Dt> "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $addr.addr = $wb", []>; -def VLD4LNd8_UPD : VLD4LNWB<0b0011, "8">; -def VLD4LNd16_UPD : VLD4LNWB<0b0111, "16"> { let Inst{5} = 0; } -def VLD4LNd32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 0; } +def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8">; +def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, 
"16">; +def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32">; -def VLD4LNq16_UPD : VLD4LNWB<0b0111, "16"> { let Inst{5} = 1; } -def VLD4LNq32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 1; } +def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16">; +def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">; // VLD1DUP : Vector Load (single element to all lanes) // VLD2DUP : Vector Load (single 2-element structure to all lanes) @@ -500,31 +494,26 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, "32"> { let Inst{6} = 1; } // FIXME: Not yet implemented. } // mayLoad = 1, hasExtraDefRegAllocReq = 1 +let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { + // VST1 : Vector Store (multiple single elements) -class VST1D<bits<4> op7_4, string Dt, ValueType Ty> +class VST1D<bits<4> op7_4, string Dt> : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, - "vst1", Dt, "\\{$src\\}, $addr", "", - [(int_arm_neon_vst1 addrmode6:$addr, (Ty DPR:$src))]>; -class VST1Q<bits<4> op7_4, string Dt, ValueType Ty> - : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$addr, QPR:$src), IIC_VST, - "vst1", Dt, "${src:dregpair}, $addr", "", - [(int_arm_neon_vst1 addrmode6:$addr, (Ty QPR:$src))]>; - -let hasExtraSrcRegAllocReq = 1 in { -def VST1d8 : VST1D<0b0000, "8", v8i8>; -def VST1d16 : VST1D<0b0100, "16", v4i16>; -def VST1d32 : VST1D<0b1000, "32", v2i32>; -def VST1df : VST1D<0b1000, "32", v2f32>; -def VST1d64 : VST1D<0b1100, "64", v1i64>; - -def VST1q8 : VST1Q<0b0000, "8", v16i8>; -def VST1q16 : VST1Q<0b0100, "16", v8i16>; -def VST1q32 : VST1Q<0b1000, "32", v4i32>; -def VST1qf : VST1Q<0b1000, "32", v4f32>; -def VST1q64 : VST1Q<0b1100, "64", v2i64>; -} // hasExtraSrcRegAllocReq - -let mayStore = 1, hasExtraSrcRegAllocReq = 1 in { + "vst1", Dt, "\\{$src\\}, $addr", "", []>; +class VST1Q<bits<4> op7_4, string Dt> + : NLdSt<0,0b00,0b1010,op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, + "vst1", Dt, "\\{$src1, $src2\\}, $addr", "", []>; + +def VST1d8 : VST1D<0b0000, "8">; +def VST1d16 : VST1D<0b0100, "16">; +def VST1d32 : VST1D<0b1000, "32">; +def VST1d64 : VST1D<0b1100, "64">; + +def VST1q8 : VST1Q<0b0000, "8">; +def VST1q16 : VST1Q<0b0100, "16">; +def VST1q32 : VST1Q<0b1000, "32">; +def VST1q64 : VST1Q<0b1100, "64">; // ...with address register writeback: class VST1DWB<bits<4> op7_4, string Dt> @@ -546,53 +535,50 @@ def VST1q16_UPD : VST1QWB<0b0100, "16">; def VST1q32_UPD : VST1QWB<0b1000, "32">; def VST1q64_UPD : VST1QWB<0b1100, "64">; -// These (dreg triple/quadruple) are for disassembly only. 
+// ...with 3 registers (some of these are only for the disassembler): class VST1D3<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; -class VST1D4<bits<4> op7_4, string Dt> - : NLdSt<0, 0b00, 0b0010, op7_4, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), - IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", - [/* For disassembly only; pattern left blank */]>; - -def VST1d8T : VST1D3<0b0000, "8">; -def VST1d16T : VST1D3<0b0100, "16">; -def VST1d32T : VST1D3<0b1000, "32">; -// VST1d64T : implemented as VST3d64 - -def VST1d8Q : VST1D4<0b0000, "8">; -def VST1d16Q : VST1D4<0b0100, "16">; -def VST1d32Q : VST1D4<0b1000, "32">; -// VST1d64Q : implemented as VST4d64 - -// ...with address register writeback: + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>; class VST1D3WB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "$addr.addr = $wb", []>; + +def VST1d8T : VST1D3<0b0000, "8">; +def VST1d16T : VST1D3<0b0100, "16">; +def VST1d32T : VST1D3<0b1000, "32">; +def VST1d64T : VST1D3<0b1100, "64">; + +def VST1d8T_UPD : VST1D3WB<0b0000, "8">; +def VST1d16T_UPD : VST1D3WB<0b0100, "16">; +def VST1d32T_UPD : VST1D3WB<0b1000, "32">; +def VST1d64T_UPD : VST1D3WB<0b1100, "64">; + +// ...with 4 registers (some of these are only for the disassembler): +class VST1D4<bits<4> op7_4, string Dt> + : NLdSt<0, 0b00, 0b0010, op7_4, (outs), + (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), + IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr", "", + []>; class VST1D4WB<bits<4> op7_4, string Dt> : NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, "vst1", Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", - [/* For disassembly only; pattern left blank */]>; + "$addr.addr = $wb", []>; -def VST1d8T_UPD : VST1D3WB<0b0000, "8">; -def VST1d16T_UPD : VST1D3WB<0b0100, "16">; -def VST1d32T_UPD : VST1D3WB<0b1000, "32">; -// VST1d64T_UPD : implemented as VST3d64_UPD +def VST1d8Q : VST1D4<0b0000, "8">; +def VST1d16Q : VST1D4<0b0100, "16">; +def VST1d32Q : VST1D4<0b1000, "32">; +def VST1d64Q : VST1D4<0b1100, "64">; def VST1d8Q_UPD : VST1D4WB<0b0000, "8">; def VST1d16Q_UPD : VST1D4WB<0b0100, "16">; def VST1d32Q_UPD : VST1D4WB<0b1000, "32">; -// VST1d64Q_UPD : implemented as VST4d64_UPD +def VST1d64Q_UPD : VST1D4WB<0b1100, "64">; // VST2 : Vector Store (multiple 2-element structures) class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -608,9 +594,6 @@ class VST2Q<bits<4> op7_4, string Dt> def VST2d8 : VST2D<0b1000, 0b0000, "8">; def VST2d16 : VST2D<0b1000, 0b0100, "16">; def VST2d32 : VST2D<0b1000, 0b1000, "32">; -def VST2d64 : NLdSt<0,0b00,0b1010,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST, - "vst1", "64", "\\{$src1, $src2\\}, $addr", "", []>; def VST2q8 : VST2Q<0b0000, "8">; def VST2q16 : VST2Q<0b0100, "16">; @@ -632,11 +615,6 @@ class VST2QWB<bits<4> op7_4, string Dt> def VST2d8_UPD : VST2DWB<0b1000, 0b0000, "8">; def VST2d16_UPD : VST2DWB<0b1000, 0b0100, "16">; def 
VST2d32_UPD : VST2DWB<0b1000, 0b1000, "32">; -def VST2d64_UPD : NLdSt<0,0b00,0b1010,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2), IIC_VST, - "vst1", "64", "\\{$src1, $src2\\}, $addr$offset", - "$addr.addr = $wb", []>; def VST2q8_UPD : VST2QWB<0b0000, "8">; def VST2q16_UPD : VST2QWB<0b0100, "16">; @@ -659,10 +637,6 @@ class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt> def VST3d8 : VST3D<0b0100, 0b0000, "8">; def VST3d16 : VST3D<0b0100, 0b0100, "16">; def VST3d32 : VST3D<0b0100, 0b1000, "32">; -def VST3d64 : NLdSt<0,0b00,0b0110,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3\\}, $addr", "", []>; // ...with address register writeback: class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -675,11 +649,6 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VST3d8_UPD : VST3DWB<0b0100, 0b0000, "8">; def VST3d16_UPD : VST3DWB<0b0100, 0b0100, "16">; def VST3d32_UPD : VST3DWB<0b0100, 0b1000, "32">; -def VST3d64_UPD : NLdSt<0,0b00,0b0110,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VST3q8 : VST3D<0b0101, 0b0000, "8">; @@ -704,11 +673,6 @@ class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt> def VST4d8 : VST4D<0b0000, 0b0000, "8">; def VST4d16 : VST4D<0b0000, 0b0100, "16">; def VST4d32 : VST4D<0b0000, 0b1000, "32">; -def VST4d64 : NLdSt<0,0b00,0b0010,0b1100, (outs), - (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, - DPR:$src4), IIC_VST, - "vst1", "64", "\\{$src1, $src2, $src3, $src4\\}, $addr", - "", []>; // ...with address register writeback: class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -721,12 +685,6 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">; def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">; def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">; -def VST4d64_UPD : NLdSt<0,0b00,0b0010,0b1100, (outs GPR:$wb), - (ins addrmode6:$addr, am6offset:$offset, - DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST, - "vst1", "64", - "\\{$src1, $src2, $src3, $src4\\}, $addr$offset", - "$addr.addr = $wb", []>; // ...with double-spaced registers (non-updating versions for disassembly only): def VST4q8 : VST4D<0b0001, 0b0000, "8">; @@ -745,109 +703,109 @@ def VST4q32odd_UPD : VST4DWB<0b0001, 0b1000, "32">; // FIXME: Not yet implemented. 
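The change running through the lane load hunks above and the lane store hunks that follow is that the VLDnLN/VSTnLN classes now receive bits 7-4 of the encoding as an explicit `bits<4> op7_4` template argument instead of each def patching individual bits with `let Inst{...}` overrides; a `?` in the bit list marks a bit (typically part of the lane index) that is still unknown when the def is written. A minimal sketch of the before/after idiom, with invented names and a trivial base class standing in for NLdSt:

class EncBits {
  bits<32> Inst;
}
class LaneBase<bits<4> op11_8, bits<4> op7_4> : EncBits {
  let Inst{11-8} = op11_8;
  let Inst{7-4}  = op7_4;   // '?' bits stay unset here
}

// Old style: the class pins nothing, every def overrides bits by hand.
class OldLN<bits<4> op11_8> : LaneBase<op11_8, {?,?,?,?}>;
def OldLNd16 : OldLN<0b0101> { let Inst{5} = 0; }

// New style: the known bits travel with the def as a template argument.
class NewLN<bits<4> op11_8, bits<4> op7_4> : LaneBase<op11_8, op7_4>;
def NewLNd16 : NewLN<0b0101, {?,?,0,?}>;

Both spellings leave the same bits set for the 16-bit case; the new one keeps the per-size bit pattern next to the opcode bits on the def line, which is what the `{?,?,0,?}`-style arguments in these hunks are doing.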
// VST2LN : Vector Store (single 2-element structure from one lane) -class VST2LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr", "", []>; -def VST2LNd8 : VST2LN<0b0001, "8">; -def VST2LNd16 : VST2LN<0b0101, "16"> { let Inst{5} = 0; } -def VST2LNd32 : VST2LN<0b1001, "32"> { let Inst{6} = 0; } +def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8">; +def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16">; +def VST2LNd32 : VST2LN<0b1001, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VST2LNq16 : VST2LN<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32 : VST2LN<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32 : VST2LN<0b1001, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST2LNq16odd : VST2LN<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32odd : VST2LN<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16odd : VST2LN<0b0101, {?,?,1,?}, "16">; +def VST2LNq32odd : VST2LN<0b1001, {?,1,?,?}, "32">; // ...with address register writeback: -class VST2LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VST, "vst2", Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST2LNd8_UPD : VST2LNWB<0b0001, "8">; -def VST2LNd16_UPD : VST2LNWB<0b0101, "16"> { let Inst{5} = 0; } -def VST2LNd32_UPD : VST2LNWB<0b1001, "32"> { let Inst{6} = 0; } +def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8">; +def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16">; +def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,?,?}, "32">; -def VST2LNq16_UPD : VST2LNWB<0b0101, "16"> { let Inst{5} = 1; } -def VST2LNq32_UPD : VST2LNWB<0b1001, "32"> { let Inst{6} = 1; } +def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16">; +def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,?,?}, "32">; // VST3LN : Vector Store (single 3-element structure from one lane) -class VST3LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST, "vst3", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>; -def VST3LNd8 : VST3LN<0b0010, "8"> { let Inst{4} = 0; } -def VST3LNd16 : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VST3LNd32 : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8">; +def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16">; +def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32">; // ...with double-spaced registers: -def VST3LNq16 : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32 : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST3LNq16odd : VST3LN<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32odd : VST3LN<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16odd : VST3LN<0b0110, {?,?,1,0}, "16">; +def VST3LNq32odd : VST3LN<0b1010, {?,1,0,0}, 
"32">; // ...with address register writeback: -class VST3LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST, "vst3", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST3LNd8_UPD : VST3LNWB<0b0010, "8"> { let Inst{4} = 0; } -def VST3LNd16_UPD : VST3LNWB<0b0110, "16"> { let Inst{5-4} = 0b00; } -def VST3LNd32_UPD : VST3LNWB<0b1010, "32"> { let Inst{6-4} = 0b000; } +def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8">; +def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16">; +def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32">; -def VST3LNq16_UPD : VST3LNWB<0b0110, "16"> { let Inst{5-4} = 0b10; } -def VST3LNq32_UPD : VST3LNWB<0b1010, "32"> { let Inst{6-4} = 0b100; } +def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16">; +def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32">; // VST4LN : Vector Store (single 4-element structure from one lane) -class VST4LN<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs), +class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST, "vst4", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr", "", []>; -def VST4LNd8 : VST4LN<0b0011, "8">; -def VST4LNd16 : VST4LN<0b0111, "16"> { let Inst{5} = 0; } -def VST4LNd32 : VST4LN<0b1011, "32"> { let Inst{6} = 0; } +def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8">; +def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16">; +def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32">; // ...with double-spaced registers: -def VST4LNq16 : VST4LN<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32 : VST4LN<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32">; // ...alternate versions to be allocated odd register numbers: -def VST4LNq16odd : VST4LN<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32odd : VST4LN<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16odd : VST4LN<0b0111, {?,?,1,?}, "16">; +def VST4LNq32odd : VST4LN<0b1011, {?,1,?,?}, "32">; // ...with address register writeback: -class VST4LNWB<bits<4> op11_8, string Dt> - : NLdSt<1, 0b00, op11_8, {?,?,?,?}, (outs GPR:$wb), +class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> + : NLdSt<1, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins addrmode6:$addr, am6offset:$offset, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST, "vst4", Dt, "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr$offset", "$addr.addr = $wb", []>; -def VST4LNd8_UPD : VST4LNWB<0b0011, "8">; -def VST4LNd16_UPD : VST4LNWB<0b0111, "16"> { let Inst{5} = 0; } -def VST4LNd32_UPD : VST4LNWB<0b1011, "32"> { let Inst{6} = 0; } +def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8">; +def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16">; +def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32">; -def VST4LNq16_UPD : VST4LNWB<0b0111, "16"> { let Inst{5} = 1; } -def VST4LNq32_UPD : VST4LNWB<0b1011, "32"> { let Inst{6} = 1; } +def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16">; +def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32">; } // mayStore = 1, hasExtraSrcRegAllocReq = 1 @@ -906,18 +864,18 @@ class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, 
bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), - (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", + (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt,"$dst, $src", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>; class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst), - (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src", "", + (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt,"$dst, $src", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>; // Basic 2-register intrinsics, both double- and quad-register. class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, - bits<2> op17_16, bits<5> op11_7, bit op4, + bits<2> op17_16, bits<5> op11_7, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst), @@ -966,8 +924,8 @@ class N3VS<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, - OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { + (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, + IIC_VBIND, OpcodeStr, Dt, "$dst, $src1, $src2", "", []> { let isCommutable = Commutable; } @@ -975,7 +933,7 @@ class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { let isCommutable = Commutable; @@ -986,27 +944,28 @@ class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, OpcodeStr, "$dst, $src1, $src2", "", [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]>{ let isCommutable = Commutable; } + class N3VDSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_VFP2:$src2), imm:$lane)))))]>{ + (Ty (NEONvduplane (Ty DPR_VFP2:$src2),imm:$lane)))))]> { let isCommutable = 0; } class N3VDSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16D, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$dst, $src1, 
$src2[$lane]","", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { @@ -1017,7 +976,7 @@ class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { let isCommutable = Commutable; @@ -1026,7 +985,7 @@ class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable> : N3VX<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, itin, OpcodeStr, "$dst, $src1, $src2", "", [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]>{ let isCommutable = Commutable; @@ -1036,7 +995,7 @@ class N3VQSL<bits<2> op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1047,7 +1006,7 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - IIC_VMULi16Q, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$dst, $src1, $src2[$lane]","", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1057,10 +1016,10 @@ class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt, // Basic 3-register intrinsics, both double- and quad-register. 
class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), f, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> { let isCommutable = Commutable; @@ -1069,7 +1028,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (IntOp (Ty DPR:$src1), (Ty (NEONvduplane (Ty DPR_VFP2:$src2), @@ -1080,19 +1039,18 @@ class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (Ty DPR:$dst), (Ty (IntOp (Ty DPR:$src1), - (Ty (NEONvduplane (Ty DPR_8:$src2), - imm:$lane)))))]> { + (Ty (NEONvduplane (Ty DPR_8:$src2), imm:$lane)))))]> { let isCommutable = 0; } class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), f, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> { let isCommutable = Commutable; @@ -1102,7 +1060,7 @@ class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1114,7 +1072,7 @@ class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), (ResTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1128,14 +1086,14 @@ class N3VSMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), itin, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", []>; class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, 
InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set DPR:$dst, (Ty (OpNode DPR:$src1, (Ty (MulOp DPR:$src2, DPR:$src3)))))]>; @@ -1144,7 +1102,8 @@ class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType Ty, SDNode MulOp, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), @@ -1156,7 +1115,8 @@ class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType Ty, SDNode MulOp, SDNode ShOp> : N3V<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (Ty DPR:$dst), (Ty (ShOp (Ty DPR:$src1), @@ -1168,7 +1128,7 @@ class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode MulOp, SDNode OpNode> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin, + (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (Ty (OpNode QPR:$src1, (Ty (MulOp QPR:$src2, QPR:$src3)))))]>; @@ -1177,7 +1137,8 @@ class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, SDNode MulOp, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), @@ -1190,7 +1151,8 @@ class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, SDNode MulOp, SDNode ShOp> : N3V<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (ShOp (ResTy QPR:$src1), @@ -1204,7 +1166,7 @@ class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin, + (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2), (OpTy DPR:$src3))))]>; @@ -1212,7 +1174,7 @@ class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 1, op4, - (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin, + (outs QPR:$dst), (ins QPR:$src1, 
QPR:$src2, QPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2), (OpTy QPR:$src3))))]>; @@ -1223,7 +1185,7 @@ class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), itin, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>; @@ -1232,7 +1194,8 @@ class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), @@ -1244,7 +1207,8 @@ class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), - (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin, + (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst", [(set (ResTy QPR:$dst), (ResTy (IntOp (ResTy QPR:$src1), @@ -1257,7 +1221,7 @@ class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VBINi4D, + (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINi4D, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> { let isCommutable = Commutable; @@ -1268,7 +1232,7 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), itin, + (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), N3RegFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> { let isCommutable = Commutable; @@ -1277,8 +1241,8 @@ class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, - (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane), + NVMulSLFrm, itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (OpTy DPR:$src1), (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2), @@ -1288,7 +1252,7 @@ class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N3V<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane), - itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "", + NVMulSLFrm, itin, OpcodeStr, Dt, 
"$dst, $src1, $src2[$lane]", "", [(set (ResTy QPR:$dst), (ResTy (IntOp (OpTy DPR:$src1), (OpTy (NEONvduplane (OpTy DPR_8:$src2), @@ -1299,7 +1263,7 @@ class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4, string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable> : N3V<op24, op23, op21_20, op11_8, 0, op4, - (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), IIC_VSUBiD, + (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), N3RegFrm, IIC_VSUBiD, OpcodeStr, Dt, "$dst, $src1, $src2", "", [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> { let isCommutable = Commutable; @@ -1344,17 +1308,17 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, // Shift by immediate, // both double- and quad-register. class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), itin, + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), f, itin, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>; class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, + Format f, InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin, + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), f, itin, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>; @@ -1363,8 +1327,8 @@ class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VSHLiD, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", + (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), N2RegVShLFrm, + IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src), (i32 imm:$SIMM))))]>; @@ -1373,7 +1337,7 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode> : N2VImm<op24, op23, op11_8, op7, op6, op4, - (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin, + (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), N2RegVShRFrm, itin, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src), (i32 imm:$SIMM))))]>; @@ -1383,14 +1347,14 @@ class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4, class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VPALiD, + (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), N2RegVShRFrm, IIC_VPALiD, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set DPR:$dst, (Ty (add DPR:$src1, (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>; class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VPALiD, + (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), N2RegVShRFrm, 
IIC_VPALiD, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set QPR:$dst, (Ty (add QPR:$src1, (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>; @@ -1398,15 +1362,15 @@ class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, // Shift by immediate and insert, // both double- and quad-register. class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VSHLiD, + (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), f, IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>; class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, - string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp> + Format f, string OpcodeStr, string Dt, ValueType Ty,SDNode ShOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VSHLiQ, + (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), f, IIC_VSHLiQ, OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst", [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>; @@ -1416,15 +1380,15 @@ class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op11_8, op7, 0, op4, - (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VUNAD, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", + (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), NVCVTFrm, + IIC_VUNAD, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>; class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, Intrinsic IntOp> : N2VImm<op24, op23, op11_8, op7, 1, op4, - (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), IIC_VUNAQ, - OpcodeStr, Dt, "$dst, $src, $SIMM", "", + (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), NVCVTFrm, + IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src, $SIMM", "", [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>; //===----------------------------------------------------------------------===// @@ -1568,24 +1532,24 @@ multiclass N2VLInt_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4, // Neon 3-register vector intrinsics. // First with only element sizes of 16 and 32 bits: -multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, +multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> { // 64-bit vector types. - def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, itinD16, + def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp, Commutable>; - def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, itinD32, + def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32, OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp, Commutable>; // 128-bit vector types. 
- def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, itinQ16, + def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16, OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp, Commutable>; - def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, itinQ32, + def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32, OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp, Commutable>; } @@ -1605,38 +1569,37 @@ multiclass N3VIntSL_HS<bits<4> op11_8, } // ....then also with element size of 8 bits: -multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, +multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_HS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, + : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { - def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, itinD16, + def v8i8 : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp, Commutable>; - def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, itinQ16, + def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16, OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp, Commutable>; } // ....then also with element size of 64 bits: -multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, +multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, Intrinsic IntOp, bit Commutable = 0> - : N3VInt_QHS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32, + : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32, OpcodeStr, Dt, IntOp, Commutable> { - def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, itinD32, + def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32, OpcodeStr, !strconcat(Dt, "64"), v1i64, v1i64, IntOp, Commutable>; - def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, itinQ32, + def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, f, itinQ32, OpcodeStr, !strconcat(Dt, "64"), v2i64, v2i64, IntOp, Commutable>; } - // Neon Narrowing 3-register vector intrinsics, // source operand element sizes of 16, 32 and 64 bits: multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4, @@ -1866,46 +1829,46 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, // Neon 2-register vector shift by immediate, +// with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - InstrItinClass itin, string OpcodeStr, string Dt, - SDNode OpNode> { + InstrItinClass itin, string OpcodeStr, string Dt, + SDNode OpNode, Format f> { // 64-bit vector types. 
- def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, itin, + def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, itin, + def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, itin, + def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, itin, + def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, f, itin, OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>; // imm6 = xxxxxx // 128-bit vector types. - def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, itin, + def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> { let Inst{21-19} = 0b001; // imm6 = 001xxx } - def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, itin, + def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } - def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, itin, + def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, f, itin, OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> { let Inst{21} = 0b1; // imm6 = 1xxxxx } - def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, itin, + def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, f, itin, OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; // imm6 = xxxxxx } - // Neon Shift-Accumulate vector operations, // element sizes of 8, 16, 32 and 64 bits: multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, @@ -1947,41 +1910,43 @@ multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, // Neon Shift-Insert vector operations, +// with f of either N2RegVShLFrm or N2RegVShRFrm // element sizes of 8, 16, 32 and 64 bits: multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, - string OpcodeStr, SDNode ShOp> { + string OpcodeStr, SDNode ShOp, + Format f> { // 64-bit vector types. def v8i8 : N2VDShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "8", v8i8, ShOp> { + f, OpcodeStr, "8", v8i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "16", v4i16, ShOp> { + f, OpcodeStr, "16", v4i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "32", v2i32, ShOp> { + f, OpcodeStr, "32", v2i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, - OpcodeStr, "64", v1i64, ShOp>; + f, OpcodeStr, "64", v1i64, ShOp>; // imm6 = xxxxxx // 128-bit vector types. 
def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "8", v16i8, ShOp> { + f, OpcodeStr, "8", v16i8, ShOp> { let Inst{21-19} = 0b001; // imm6 = 001xxx } def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "16", v8i16, ShOp> { + f, OpcodeStr, "16", v8i16, ShOp> { let Inst{21-20} = 0b01; // imm6 = 01xxxx } def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, - OpcodeStr, "32", v4i32, ShOp> { + f, OpcodeStr, "32", v4i32, ShOp> { let Inst{21} = 0b1; // imm6 = 1xxxxx } def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, - OpcodeStr, "64", v2i64, ShOp>; + f, OpcodeStr, "64", v2i64, ShOp>; // imm6 = xxxxxx } @@ -2044,20 +2009,26 @@ defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, "vaddl", "u", defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw", "s", int_arm_neon_vaddws, 0>; defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw", "u", int_arm_neon_vaddwu, 0>; // VHADD : Vector Halving Add -defm VHADDs : N3VInt_QHS<0,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vhadd", "s", int_arm_neon_vhadds, 1>; -defm VHADDu : N3VInt_QHS<1,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vhadd", "u", int_arm_neon_vhaddu, 1>; +defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "s", int_arm_neon_vhadds, 1>; +defm VHADDu : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vhadd", "u", int_arm_neon_vhaddu, 1>; // VRHADD : Vector Rounding Halving Add -defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vrhadd", "s", int_arm_neon_vrhadds, 1>; -defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vrhadd", "u", int_arm_neon_vrhaddu, 1>; +defm VRHADDs : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "s", int_arm_neon_vrhadds, 1>; +defm VRHADDu : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vrhadd", "u", int_arm_neon_vrhaddu, 1>; // VQADD : Vector Saturating Add -defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vqadd", "s", int_arm_neon_vqadds, 1>; -defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, - IIC_VBINi4Q, "vqadd", "u", int_arm_neon_vqaddu, 1>; +defm VQADDs : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "s", int_arm_neon_vqadds, 1>; +defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, + IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, + "vqadd", "u", int_arm_neon_vqaddu, 1>; // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", int_arm_neon_vaddhn, 1>; @@ -2070,10 +2041,10 @@ defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i", // VMUL : Vector Multiply (integer, polynomial and floating-point) defm VMUL : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D, IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>; -def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16D, "vmul", "p8", - v8i8, v8i8, int_arm_neon_vmulp, 1>; -def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16Q, "vmul", "p8", - v16i8, v16i8, int_arm_neon_vmulp, 1>; +def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul", + "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>; +def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul", + "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>; 
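The other recurring change in this region is that the three-register NEON classes (N3VDInt, N3VQInt, and the N3VInt_HS/_QHS/_QHSD multiclasses built on them) gain an explicit `Format f` parameter just before the itinerary, and every instantiation now states its encoding format (N3RegFrm for plain three-register forms, NVMulSLFrm for the multiply-by-scalar-lane forms, and so on), presumably so later consumers such as the disassembler can key off the recorded format. A reduced sketch of threading such a tag through a multiclass, with invented names only:

class EncFormat<bits<2> val> { bits<2> Value = val; }
def ThreeRegFrm : EncFormat<0>;
def MulLaneFrm  : EncFormat<1>;

class NeonLikeInst<EncFormat f, string asm> {
  EncFormat F = f;       // recorded per instruction, e.g. for a decoder table
  string AsmString = asm;
}

multiclass IntOp_DQ<EncFormat f, string opc> {
  def D : NeonLikeInst<f, opc>;   // 64-bit register variant
  def Q : NeonLikeInst<f, opc>;   // 128-bit register variant
}

// Callers now spell out the format, mirroring the VMULp defs above:
defm VFOOi  : IntOp_DQ<ThreeRegFrm, "vfoo">;
defm VFOOsl : IntOp_DQ<MulLaneFrm, "vfoo">;

The hunks below apply the same mechanical change to VQDMULH, VHSUB/VQSUB, VABD, VMAX/VMIN, the pairwise operations, and the reciprocal-step instructions.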
def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32",
v2f32, v2f32, fmul, 1>;
def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32",
@@ -2103,7 +2074,7 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
(SubReg_i32_lane imm:$lane)))>;
// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
-defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
+defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,
"vqdmulh", "s", int_arm_neon_vqdmulh, 1>;
defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
@@ -2125,8 +2096,8 @@ def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
(SubReg_i32_lane imm:$lane)))>;
// VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
-defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
- IIC_VMULi16Q, IIC_VMULi32Q,
+defm VQRDMULH : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm,
+ IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q,
"vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>;
defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,
@@ -2285,18 +2256,18 @@ defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, "vsubl", "u",
defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw", "s", int_arm_neon_vsubws, 0>;
defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw", "u", int_arm_neon_vsubwu, 0>;
// VHSUB : Vector Halving Subtract
-defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vhsub", "s", int_arm_neon_vhsubs, 0>;
-defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vhsub", "u", int_arm_neon_vhsubu, 0>;
// VQSUB : Vector Saturing Subtract
-defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vqsub", "s", int_arm_neon_vqsubs, 0>;
-defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vqsub", "u", int_arm_neon_vqsubu, 0>;
// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i",
@@ -2323,8 +2294,8 @@ defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D,
IIC_VBINi4Q, IIC_VBINi4Q, "vcge", "s", NEONvcge, 0>;
defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D,
IIC_VBINi4Q, IIC_VBINi4Q, "vcge", "u", NEONvcgeu, 0>;
-def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32",
- v2i32, v2f32, NEONvcge, 0>;
+def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
+ NEONvcge, 0>;
def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
NEONvcge, 0>;
// For disassembly only.
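The next hunk also replaces the generic vnot helpers used by VBIC, VORN, VMVN and VBSL with two explicit PatFrags, vnot8 and vnot16, which spell vector bitwise NOT as XOR with an all-ones vector of the matching width (there is no standalone vector NOT node in the selection DAG, so the all-ones immediate is bitconverted to whatever element type the operand uses). A standalone sketch of the same idiom with renamed fragments, assuming the stock SelectionDAG definitions (xor, bitconvert, immAllOnesV, the vector value types) are in scope via the usual target includes:

def not_v8i8  : PatFrag<(ops node:$in),
                        (xor node:$in, (bitconvert (v8i8  immAllOnesV)))>;
def not_v16i8 : PatFrag<(ops node:$in),
                        (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>;

// A selection pattern can then use the fragment like any other node, e.g.
// (hypothetical instruction name):
// def : Pat<(v2i32 (not_v8i8 DPR:$src)), (SOME_D_REG_NOT DPR:$src)>;

Splitting the fragment by width lets the 64-bit and 128-bit patterns each pick the right all-ones constant, which is the point of having both vnot8 and vnot16 below.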
@@ -2351,21 +2322,27 @@ defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", "$dst, $src, #0">; // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) -def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, IIC_VBIND, "vacge", "f32", - v2i32, v2f32, int_arm_neon_vacged, 0>; -def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, IIC_VBINQ, "vacge", "f32", - v4i32, v4f32, int_arm_neon_vacgeq, 0>; +def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f32", v2i32, v2f32, int_arm_neon_vacged, 0>; +def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f32", v4i32, v4f32, int_arm_neon_vacgeq, 0>; // VACGT : Vector Absolute Compare Greater Than (aka VCAGT) -def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, IIC_VBIND, "vacgt", "f32", - v2i32, v2f32, int_arm_neon_vacgtd, 0>; -def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, IIC_VBINQ, "vacgt", "f32", - v4i32, v4f32, int_arm_neon_vacgtq, 0>; +def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f32", v2i32, v2f32, int_arm_neon_vacgtd, 0>; +def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", + "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>; // VTST : Vector Test Bits defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; // Vector Bitwise Operations. +def vnot8 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v8i8 immAllOnesV)))>; +def vnot16 : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; + + // VAND : Vector Bitwise AND def VANDd : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand", v2i32, v2i32, and, 1>; @@ -2386,74 +2363,80 @@ def VORRq : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr", // VBIC : Vector Bitwise Bit Clear (AND NOT) def VBICd : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), IIC_VBINiD, - "vbic", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (and DPR:$src1, - (vnot_conv DPR:$src2))))]>; + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vbic", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (and DPR:$src1, + (vnot8 DPR:$src2))))]>; def VBICq : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), IIC_VBINiQ, - "vbic", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (and QPR:$src1, - (vnot_conv QPR:$src2))))]>; + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vbic", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (and QPR:$src1, + (vnot16 QPR:$src2))))]>; // VORN : Vector Bitwise OR NOT def VORNd : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2), IIC_VBINiD, - "vorn", "$dst, $src1, $src2", "", - [(set DPR:$dst, (v2i32 (or DPR:$src1, - (vnot_conv DPR:$src2))))]>; + (ins DPR:$src1, DPR:$src2), N3RegFrm, IIC_VBINiD, + "vorn", "$dst, $src1, $src2", "", + [(set DPR:$dst, (v2i32 (or DPR:$src1, + (vnot8 DPR:$src2))))]>; def VORNq : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2), IIC_VBINiQ, - "vorn", "$dst, $src1, $src2", "", - [(set QPR:$dst, (v4i32 (or QPR:$src1, - (vnot_conv QPR:$src2))))]>; + (ins QPR:$src1, QPR:$src2), N3RegFrm, IIC_VBINiQ, + "vorn", "$dst, $src1, $src2", "", + [(set QPR:$dst, (v4i32 (or QPR:$src1, + (vnot16 QPR:$src2))))]>; // VMVN : Vector Bitwise NOT def VMVNd : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0, - (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, - "vmvn", "$dst, $src", "", - [(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>; + (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, + "vmvn", "$dst, $src", "", + 
[(set DPR:$dst, (v2i32 (vnot8 DPR:$src)))]>; def VMVNq : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0, - (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, - "vmvn", "$dst, $src", "", - [(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>; -def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>; -def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>; + (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, + "vmvn", "$dst, $src", "", + [(set QPR:$dst, (v4i32 (vnot16 QPR:$src)))]>; +def : Pat<(v2i32 (vnot8 DPR:$src)), (VMVNd DPR:$src)>; +def : Pat<(v4i32 (vnot16 QPR:$src)), (VMVNq QPR:$src)>; // VBSL : Vector Bitwise Select def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst), - (ins DPR:$src1, DPR:$src2, DPR:$src3), IIC_VCNTiD, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set DPR:$dst, - (v2i32 (or (and DPR:$src2, DPR:$src1), - (and DPR:$src3, (vnot_conv DPR:$src1)))))]>; + (ins DPR:$src1, DPR:$src2, DPR:$src3), + N3RegFrm, IIC_VCNTiD, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set DPR:$dst, + (v2i32 (or (and DPR:$src2, DPR:$src1), + (and DPR:$src3, (vnot8 DPR:$src1)))))]>; def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), - (ins QPR:$src1, QPR:$src2, QPR:$src3), IIC_VCNTiQ, - "vbsl", "$dst, $src2, $src3", "$src1 = $dst", - [(set QPR:$dst, - (v4i32 (or (and QPR:$src2, QPR:$src1), - (and QPR:$src3, (vnot_conv QPR:$src1)))))]>; + (ins QPR:$src1, QPR:$src2, QPR:$src3), + N3RegFrm, IIC_VCNTiQ, + "vbsl", "$dst, $src2, $src3", "$src1 = $dst", + [(set QPR:$dst, + (v4i32 (or (and QPR:$src2, QPR:$src1), + (and QPR:$src3, (vnot16 QPR:$src1)))))]>; // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VBINiD, "vbif", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiD, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; def VBIFq : N3VX<1, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), - IIC_VBINiQ, "vbif", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiQ, + "vbif", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; // VBIT : Vector Bitwise Insert if True // like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst", def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), - IIC_VBINiD, "vbit", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiD, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), - IIC_VBINiQ, "vbit", "$dst, $src2, $src3", "$src1 = $dst", + N3RegFrm, IIC_VBINiQ, + "vbit", "$dst, $src2, $src3", "$src1 = $dst", [/* For disassembly only; pattern left blank */]>; // VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking @@ -2463,15 +2446,15 @@ def VBITq : N3VX<1, 0, 0b10, 0b0001, 1, 1, // Vector Absolute Differences. 
// VABD : Vector Absolute Difference
-defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vabd", "s", int_arm_neon_vabds, 0>;
-defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D,
- IIC_VBINi4Q, IIC_VBINi4Q,
+defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vabd", "u", int_arm_neon_vabdu, 0>;
-def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, IIC_VBIND,
+def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
"vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>;
-def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, IIC_VBINQ,
+def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
"vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>;
// VABDL : Vector Absolute Difference Long (Q = | D - D |)
@@ -2491,36 +2474,40 @@ defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal", "u", int_arm_neon_vabalu>;
// Vector Maximum and Minimum.
// VMAX : Vector Maximum
-defm VMAXs : N3VInt_QHS<0,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
- IIC_VBINi4Q, "vmax", "s", int_arm_neon_vmaxs, 1>;
-defm VMAXu : N3VInt_QHS<1,0,0b0110,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
- IIC_VBINi4Q, "vmax", "u", int_arm_neon_vmaxu, 1>;
-def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, IIC_VBIND, "vmax", "f32",
- v2f32, v2f32, int_arm_neon_vmaxs, 1>;
-def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, IIC_VBINQ, "vmax", "f32",
- v4f32, v4f32, int_arm_neon_vmaxs, 1>;
+defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vmax", "s", int_arm_neon_vmaxs, 1>;
+defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vmax", "u", int_arm_neon_vmaxu, 1>;
+def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmax",
+ "f32", v2f32, v2f32, int_arm_neon_vmaxs, 1>;
+def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax",
+ "f32", v4f32, v4f32, int_arm_neon_vmaxs, 1>;
// VMIN : Vector Minimum
-defm VMINs : N3VInt_QHS<0,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
- IIC_VBINi4Q, "vmin", "s", int_arm_neon_vmins, 1>;
-defm VMINu : N3VInt_QHS<1,0,0b0110,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
- IIC_VBINi4Q, "vmin", "u", int_arm_neon_vminu, 1>;
-def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, IIC_VBIND, "vmin", "f32",
- v2f32, v2f32, int_arm_neon_vmins, 1>;
-def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, IIC_VBINQ, "vmin", "f32",
- v4f32, v4f32, int_arm_neon_vmins, 1>;
+defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vmin", "s", int_arm_neon_vmins, 1>;
+defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
+ IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+ "vmin", "u", int_arm_neon_vminu, 1>;
+def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmin",
+ "f32", v2f32, v2f32, int_arm_neon_vmins, 1>;
+def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmin",
+ "f32", v4f32, v4f32, int_arm_neon_vmins, 1>;
// Vector Pairwise Operations.
// VPADD : Vector Pairwise Add -def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, IIC_VBINiD, "vpadd", "i8", - v8i8, v8i8, int_arm_neon_vpadd, 0>; -def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, IIC_VBINiD, "vpadd", "i16", - v4i16, v4i16, int_arm_neon_vpadd, 0>; -def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, IIC_VBINiD, "vpadd", "i32", - v2i32, v2i32, int_arm_neon_vpadd, 0>; -def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, IIC_VBIND, "vpadd", "f32", - v2f32, v2f32, int_arm_neon_vpadd, 0>; +def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i8", v8i8, v8i8, int_arm_neon_vpadd, 0>; +def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i16", v4i16, v4i16, int_arm_neon_vpadd, 0>; +def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VBINiD, "vpadd", + "i32", v2i32, v2i32, int_arm_neon_vpadd, 0>; +def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, IIC_VBIND, "vpadd", + "f32", v2f32, v2f32, int_arm_neon_vpadd, 0>; // VPADDL : Vector Pairwise Add Long defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", @@ -2535,36 +2522,36 @@ defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u", int_arm_neon_vpadalu>; // VPMAX : Vector Pairwise Maximum -def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "s8", - v8i8, v8i8, int_arm_neon_vpmaxs, 0>; -def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "s16", - v4i16, v4i16, int_arm_neon_vpmaxs, 0>; -def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", "s32", - v2i32, v2i32, int_arm_neon_vpmaxs, 0>; -def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "u8", - v8i8, v8i8, int_arm_neon_vpmaxu, 0>; -def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "u16", - v4i16, v4i16, int_arm_neon_vpmaxu, 0>; -def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", "u32", - v2i32, v2i32, int_arm_neon_vpmaxu, 0>; -def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, IIC_VBINi4D, "vpmax", "f32", - v2f32, v2f32, int_arm_neon_vpmaxs, 0>; +def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>; +def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>; +def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>; +def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>; +def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>; +def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; +def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINi4D, "vpmax", + "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; // VPMIN : Vector Pairwise Minimum -def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "s8", - v8i8, v8i8, int_arm_neon_vpmins, 0>; -def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "s16", - v4i16, v4i16, int_arm_neon_vpmins, 0>; -def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "s32", - v2i32, v2i32, int_arm_neon_vpmins, 0>; -def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "u8", - v8i8, v8i8, int_arm_neon_vpminu, 0>; -def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "u16", - v4i16, v4i16, int_arm_neon_vpminu, 0>; -def 
VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "u32", - v2i32, v2i32, int_arm_neon_vpminu, 0>; -def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, IIC_VBINi4D, "vpmin", "f32", - v2f32, v2f32, int_arm_neon_vpmins, 0>; +def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>; +def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>; +def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>; +def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>; +def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>; +def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VBINi4D, "vpmin", + "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; +def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINi4D, "vpmin", + "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; // Vector Reciprocal and Reciprocal Square Root Estimate and Step. @@ -2583,10 +2570,10 @@ def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, v4f32, v4f32, int_arm_neon_vrecpe>; // VRECPS : Vector Reciprocal Step -def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, +def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSD, "vrecps", "f32", v2f32, v2f32, int_arm_neon_vrecps, 1>; -def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, +def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrecps", "f32", v4f32, v4f32, int_arm_neon_vrecps, 1>; @@ -2605,25 +2592,30 @@ def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, v4f32, v4f32, int_arm_neon_vrsqrte>; // VRSQRTS : Vector Reciprocal Square Root Step -def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, +def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, IIC_VRECSD, "vrsqrts", "f32", v2f32, v2f32, int_arm_neon_vrsqrts, 1>; -def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, +def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrsqrts", "f32", v4f32, v4f32, int_arm_neon_vrsqrts, 1>; // Vector Shifts. 
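The estimate/step pairs above are meant to be used together: VRECPE (or VRSQRTE) supplies a coarse seed, and each VRECPS (or VRSQRTS) performs one Newton-Raphson refinement, computing 2 - a*b (respectively (3 - a*b)/2). A small usage sketch with the standard <arm_neon.h> intrinsics, shown only to illustrate how the paired instructions compose; it is not something this patch adds:

    #include <arm_neon.h>
    // Approximate 1.0/d with VRECPE plus two VRECPS Newton-Raphson steps:
    // x(n+1) = x(n) * (2 - d * x(n)).
    static float32x2_t reciprocal(float32x2_t d) {
      float32x2_t x = vrecpe_f32(d);        // coarse estimate of 1/d
      x = vmul_f32(x, vrecps_f32(d, x));    // first refinement
      x = vmul_f32(x, vrecps_f32(d, x));    // second refinement
      return x;
    }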
// VSHL : Vector Shift -defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, - IIC_VSHLiQ, "vshl", "s", int_arm_neon_vshifts, 0>; -defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, - IIC_VSHLiQ, "vshl", "u", int_arm_neon_vshiftu, 0>; +defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "s", int_arm_neon_vshifts, 0>; +defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, N3RegVShFrm, + IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ, + "vshl", "u", int_arm_neon_vshiftu, 0>; // VSHL : Vector Shift Left (Immediate) -defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; +defm VSHLi : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl, + N2RegVShLFrm>; // VSHR : Vector Shift Right (Immediate) -defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs>; -defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru>; +defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs, + N2RegVShRFrm>; +defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru, + N2RegVShRFrm>; // VSHLL : Vector Shift Left Long defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>; @@ -2649,28 +2641,37 @@ defm VSHRN : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", NEONvshrn>; // VRSHL : Vector Rounding Shift -defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "s", int_arm_neon_vrshifts,0>; -defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu,0>; +defm VRSHLs : N3VInt_QHSD<0, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "s", int_arm_neon_vrshifts, 0>; +defm VRSHLu : N3VInt_QHSD<1, 0, 0b0101, 0, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vrshl", "u", int_arm_neon_vrshiftu, 0>; // VRSHR : Vector Rounding Shift Right -defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs>; -defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru>; +defm VRSHRs : N2VSh_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs, + N2RegVShRFrm>; +defm VRSHRu : N2VSh_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru, + N2RegVShRFrm>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", NEONvrshrn>; // VQSHL : Vector Saturating Shift -defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "s", int_arm_neon_vqshifts,0>; -defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu,0>; +defm VQSHLs : N3VInt_QHSD<0, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "s", int_arm_neon_vqshifts, 0>; +defm VQSHLu : N3VInt_QHSD<1, 0, 0b0100, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqshl", "u", int_arm_neon_vqshiftu, 0>; // VQSHL : Vector Saturating Shift Left (Immediate) -defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s", NEONvqshls>; -defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u", NEONvqshlu>; +defm VQSHLsi : N2VSh_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls, + N2RegVShLFrm>; +defm VQSHLui : N2VSh_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu, + 
N2RegVShLFrm>; // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned) -defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D, "vqshlu","s",NEONvqshlsu>; +defm VQSHLsu : N2VSh_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu, + N2RegVShLFrm>; // VQSHRN : Vector Saturating Shift Right and Narrow defm VQSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s", @@ -2683,12 +2684,12 @@ defm VQSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s", NEONvqshrnsu>; // VQRSHL : Vector Saturating Rounding Shift -defm VQRSHLs : N3VInt_QHSD<0,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqrshl", "s", - int_arm_neon_vqrshifts, 0>; -defm VQRSHLu : N3VInt_QHSD<1,0,0b0101,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, - IIC_VSHLi4Q, "vqrshl", "u", - int_arm_neon_vqrshiftu, 0>; +defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "s", int_arm_neon_vqrshifts, 0>; +defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, N3RegVShFrm, + IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, + "vqrshl", "u", int_arm_neon_vqrshiftu, 0>; // VQRSHRN : Vector Saturating Rounding Shift Right and Narrow defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s", @@ -2708,9 +2709,9 @@ defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>; defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>; // VSLI : Vector Shift Left and Insert -defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli>; +defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli, N2RegVShLFrm>; // VSRI : Vector Shift Right and Insert -defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri>; +defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri, N2RegVShRFrm>; // Vector Absolute and Saturating Absolute. @@ -2732,19 +2733,22 @@ defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, // Vector Negate. 
-def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; -def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>; +def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; +def vneg8 : PatFrag<(ops node:$in), + (sub (bitconvert (v8i8 immAllZerosV)), node:$in)>; +def vneg16 : PatFrag<(ops node:$in), + (sub (bitconvert (v16i8 immAllZerosV)), node:$in)>; class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set DPR:$dst, (Ty (vneg DPR:$src)))]>; + [(set DPR:$dst, (Ty (vneg8 DPR:$src)))]>; class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "", - [(set QPR:$dst, (Ty (vneg QPR:$src)))]>; + [(set QPR:$dst, (Ty (vneg16 QPR:$src)))]>; -// VNEG : Vector Negate +// VNEG : Vector Negate (integer) def VNEGs8d : VNEGD<0b00, "vneg", "s8", v8i8>; def VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>; def VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>; @@ -2762,12 +2766,12 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, "vneg", "f32", "$dst, $src", "", [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>; -def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>; -def : Pat<(v4i16 (vneg_conv DPR:$src)), (VNEGs16d DPR:$src)>; -def : Pat<(v2i32 (vneg_conv DPR:$src)), (VNEGs32d DPR:$src)>; -def : Pat<(v16i8 (vneg_conv QPR:$src)), (VNEGs8q QPR:$src)>; -def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>; -def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>; +def : Pat<(v8i8 (vneg8 DPR:$src)), (VNEGs8d DPR:$src)>; +def : Pat<(v4i16 (vneg8 DPR:$src)), (VNEGs16d DPR:$src)>; +def : Pat<(v2i32 (vneg8 DPR:$src)), (VNEGs32d DPR:$src)>; +def : Pat<(v16i8 (vneg16 QPR:$src)), (VNEGs8q QPR:$src)>; +def : Pat<(v8i16 (vneg16 QPR:$src)), (VNEGs16q QPR:$src)>; +def : Pat<(v4i32 (vneg16 QPR:$src)), (VNEGs32q QPR:$src)>; // VQNEG : Vector Saturating Negate defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, @@ -2805,9 +2809,9 @@ def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, // VMOV : Vector Move (Register) def VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src), - IIC_VMOVD, "vmov", "$dst, $src", "", []>; + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; def VMOVQ : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src), - IIC_VMOVD, "vmov", "$dst, $src", "", []>; + N3RegFrm, IIC_VMOVD, "vmov", "$dst, $src", "", []>; // VMOV : Vector Move (Immediate) @@ -3048,30 +3052,29 @@ def VDUPfq : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src), // VDUP : Vector Duplicate Lane (from scalar to all elements) -class VDUPLND<bits<2> op19_18, bits<2> op17_16, - string OpcodeStr, string Dt, ValueType Ty> - : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0, - (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src[$lane]", "", - [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>; +class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt, + ValueType Ty> + : NVDupLane<op19_16, 0, (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", + [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>; -class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, string Dt, +class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt, ValueType ResTy, ValueType 
OpTy> - : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0, - (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD, - OpcodeStr, Dt, "$dst, $src[$lane]", "", - [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>; + : NVDupLane<op19_16, 1, (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), + IIC_VMOVD, OpcodeStr, Dt, "$dst, $src[$lane]", + [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), + imm:$lane)))]>; // Inst{19-16} is partially specified depending on the element size. -def VDUPLN8d : VDUPLND<{?,?}, {?,1}, "vdup", "8", v8i8>; -def VDUPLN16d : VDUPLND<{?,?}, {1,0}, "vdup", "16", v4i16>; -def VDUPLN32d : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2i32>; -def VDUPLNfd : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2f32>; -def VDUPLN8q : VDUPLNQ<{?,?}, {?,1}, "vdup", "8", v16i8, v8i8>; -def VDUPLN16q : VDUPLNQ<{?,?}, {1,0}, "vdup", "16", v8i16, v4i16>; -def VDUPLN32q : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4i32, v2i32>; -def VDUPLNfq : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4f32, v2f32>; +def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8>; +def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16>; +def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32>; +def VDUPLNfd : VDUPLND<{?,1,0,0}, "vdup", "32", v2f32>; +def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8>; +def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16>; +def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32>; +def VDUPLNfq : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4f32, v2f32>; def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)), (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src, @@ -3233,15 +3236,15 @@ def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>; class VEXTd<string OpcodeStr, string Dt, ValueType Ty> : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$dst), - (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD, - OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + (ins DPR:$lhs, DPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTD, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs), (Ty DPR:$rhs), imm:$index)))]>; class VEXTq<string OpcodeStr, string Dt, ValueType Ty> : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$dst), - (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ, - OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", + (ins QPR:$lhs, QPR:$rhs, i32imm:$index), NVExtFrm, + IIC_VEXTQ, OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "", [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs), (Ty QPR:$rhs), imm:$index)))]>; @@ -3290,25 +3293,26 @@ def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">; // VTBL : Vector Table Lookup def VTBL1 : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$src), IIC_VTB1, + (ins DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTB1, "vtbl", "8", "$dst, \\{$tbl1\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBL2 : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTB2, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2, "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2 DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; def VTBL3 : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTB3, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3, "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3 DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; def VTBL4 : N3V<1,1,0b11,0b1011,0,0, 
(outs DPR:$dst), - (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTB4, + (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), + NVTBLFrm, IIC_VTB4, "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; @@ -3317,26 +3321,27 @@ def VTBL4 // VTBX : Vector Table Extension def VTBX1 : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$src), IIC_VTBX1, + (ins DPR:$orig, DPR:$tbl1, DPR:$src), NVTBLFrm, IIC_VTBX1, "vtbx", "8", "$dst, \\{$tbl1\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1 DPR:$orig, DPR:$tbl1, DPR:$src)))]>; let hasExtraSrcRegAllocReq = 1 in { def VTBX2 : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTBX2, + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2, "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2 DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; def VTBX3 : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst), - (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTBX3, + (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), + NVTBLFrm, IIC_VTBX3, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; def VTBX4 : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTBX4, + DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "$orig = $dst", [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1, @@ -3396,12 +3401,12 @@ def : N3VSPat<fmul, VMULfd_sfp>; //let neverHasSideEffects = 1 in //def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32", -// v2f32, fmul, fadd>; +// v2f32, fmul, fadd>; //def : N3VSMulOpPat<fmul, fadd, VMLAfd_sfp>; //let neverHasSideEffects = 1 in //def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32", -// v2f32, fmul, fsub>; +// v2f32, fmul, fsub>; //def : N3VSMulOpPat<fmul, fsub, VMLSfd_sfp>; // Vector Absolute used for single-precision FP @@ -3421,14 +3426,14 @@ def : N2VSPat<fneg, f32, v2f32, VNEGfd_sfp>; // Vector Maximum used for single-precision FP let neverHasSideEffects = 1 in def VMAXfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, "vmax", "f32", "$dst, $src1, $src2", "", []>; def : N3VSPat<NEONfmax, VMAXfd_sfp>; // Vector Minimum used for single-precision FP let neverHasSideEffects = 1 in def VMINfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst), - (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND, + (ins DPR_VFP2:$src1, DPR_VFP2:$src2), N3RegFrm, IIC_VBIND, "vmin", "f32", "$dst, $src1, $src2", "", []>; def : N3VSPat<NEONfmin, VMINfd_sfp>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index aca8230..0458389 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -545,7 +545,7 @@ def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, // FP FMA Operations. 
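Looking back at the VTBL/VTBX definitions above (which only gain the NVTBLFrm format argument here): VTBL treats one to four D registers as a byte table and uses each byte of the index operand to pick a table byte, producing zero for out-of-range indices, while VTBX leaves the corresponding destination byte unchanged. A one-line sketch with the standard intrinsic, for illustration only:

    #include <arm_neon.h>
    // VTBL1 via the vtbl1_u8 intrinsic: result[i] = table[idx[i]] if
    // idx[i] < 8, else 0. (vtbx1_u8 would keep the old destination byte.)
    static uint8x8_t byte_lookup(uint8x8_t table, uint8x8_t idx) {
      return vtbl1_u8(table, idx);
    }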
// -def VMLAD : ADbI<0b11100, 0b00, 0, 0, +def VMLAD : ADbI_vmlX<0b11100, 0b00, 0, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vmla", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), @@ -558,7 +558,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, RegConstraint<"$dstin = $dst">; -def VNMLSD : ADbI<0b11100, 0b01, 0, 0, +def VNMLSD : ADbI_vmlX<0b11100, 0b01, 0, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vnmls", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), @@ -571,7 +571,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, RegConstraint<"$dstin = $dst">; -def VMLSD : ADbI<0b11100, 0b00, 1, 0, +def VMLSD : ADbI_vmlX<0b11100, 0b00, 1, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vmls", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), @@ -589,7 +589,7 @@ def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))), def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>; -def VNMLAD : ADbI<0b11100, 0b01, 1, 0, +def VNMLAD : ADbI_vmlX<0b11100, 0b01, 1, 0, (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b), IIC_fpMAC64, "vnmla", ".f64\t$dst, $a, $b", [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index bdbec30..cb762a4 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -341,6 +341,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned PReg = PMO.getReg(); unsigned PRegNum = PMO.isUndef() ? UINT_MAX : ARMRegisterInfo::getRegisterNumbering(PReg); + unsigned Count = 1; for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { int NewOffset = MemOps[i].Offset; @@ -350,11 +351,14 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, : ARMRegisterInfo::getRegisterNumbering(Reg); // AM4 - register numbers in ascending order. // AM5 - consecutive register numbers in ascending order. + // Can only do up to 16 double-word registers per insn. if (Reg != ARM::SP && NewOffset == Offset + (int)Size && - ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) { + ((isAM4 && RegNum > PRegNum) + || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) { Offset += Size; PRegNum = RegNum; + ++Count; } else { // Can't merge this in. Try merge the earlier ones first. 
MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 2dad7f1..9e55cd8 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -22,10 +22,6 @@ using namespace llvm; static cl::opt<bool> ReserveR9("arm-reserve-r9", cl::Hidden, cl::desc("Reserve R9, making it unavailable as GPR")); -static cl::opt<bool> -UseNEONFP("arm-use-neon-fp", - cl::desc("Use NEON for single-precision FP"), - cl::init(false), cl::Hidden); static cl::opt<bool> UseMOVT("arm-use-movt", @@ -35,7 +31,8 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, bool isT) : ARMArchVersion(V4) , ARMFPUType(None) - , UseNEONForSinglePrecisionFP(UseNEONFP) + , UseNEONForSinglePrecisionFP(false) + , SlowVMLx(false) , IsThumb(isT) , ThumbMode(Thumb1) , PostRAScheduler(false) @@ -115,14 +112,6 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, if (!isThumb() || hasThumb2()) PostRAScheduler = true; - - // Set CPU specific features. - if (CPUString == "cortex-a8") { - // On Cortex-a8, it's faster to perform some single-precision FP - // operations with NEON instructions. - if (UseNEONFP.getPosition() == 0) - UseNEONForSinglePrecisionFP = true; - } } /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 2dc81a4..fa56a91 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -50,6 +50,10 @@ protected: /// determine if NEON should actually be used. bool UseNEONForSinglePrecisionFP; + /// SlowVMLx - If the VFP2 instructions are available, indicates whether + /// the VML[AS] instructions are slow (if so, don't use them). + bool SlowVMLx; + /// IsThumb - True if we are in thumb mode, false if in ARM mode. bool IsThumb; @@ -119,6 +123,7 @@ protected: bool hasNEON() const { return ARMFPUType >= NEON; } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; } + bool useVMLx() const {return hasVFP2() && !SlowVMLx; } bool hasFP16() const { return HasFP16; } diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index 4a7a1e4..ba736e3 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -841,7 +841,7 @@ GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2, raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << getFunctionNumber() << '_' << uid << '_' << uid2 << "_set_" << MBB->getNumber(); - return OutContext.GetOrCreateTemporarySymbol(Name.str()); + return OutContext.GetOrCreateSymbol(Name.str()); } MCSymbol *ARMAsmPrinter:: @@ -849,7 +849,7 @@ GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const { SmallString<60> Name; raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_' << uid << '_' << uid2; - return OutContext.GetOrCreateTemporarySymbol(Name.str()); + return OutContext.GetOrCreateSymbol(Name.str()); } void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum) { @@ -1132,6 +1132,11 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); else // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. 
+ // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), 4/*size*/, 0/*addrspace*/); @@ -1186,7 +1191,7 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { // FIXME: MOVE TO SHARED PLACE. unsigned Id = (unsigned)MI->getOperand(2).getImm(); const char *Prefix = MAI->getPrivateGlobalPrefix(); - MCSymbol *Label =OutContext.GetOrCreateTemporarySymbol(Twine(Prefix) + MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix) + "PC" + Twine(getFunctionNumber()) + "_" + Twine(Id)); OutStreamer.EmitLabel(Label); diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp index 7cb305f..ab2b06b 100644 --- a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp @@ -75,7 +75,7 @@ GetJumpTableSymbol(const MachineOperand &MO) const { #endif // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCSymbol *ARMMCInstLower:: @@ -91,7 +91,7 @@ GetConstantPoolIndexSymbol(const MachineOperand &MO) const { #endif // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCOperand ARMMCInstLower:: diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index c36fe63..7334259 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -46,10 +46,13 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, default: break; + case ARM::VLD1q8: + case ARM::VLD1q16: + case ARM::VLD1q32: + case ARM::VLD1q64: case ARM::VLD2d8: case ARM::VLD2d16: case ARM::VLD2d32: - case ARM::VLD2d64: case ARM::VLD2LNd8: case ARM::VLD2LNd16: case ARM::VLD2LNd32: @@ -83,7 +86,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VLD3d8: case ARM::VLD3d16: case ARM::VLD3d32: - case ARM::VLD3d64: + case ARM::VLD1d64T: case ARM::VLD3LNd8: case ARM::VLD3LNd16: case ARM::VLD3LNd32: @@ -128,7 +131,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VLD4d8: case ARM::VLD4d16: case ARM::VLD4d32: - case ARM::VLD4d64: + case ARM::VLD1d64Q: case ARM::VLD4LNd8: case ARM::VLD4LNd16: case ARM::VLD4LNd32: @@ -170,10 +173,13 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; + case ARM::VST1q8: + case ARM::VST1q16: + case ARM::VST1q32: + case ARM::VST1q64: case ARM::VST2d8: case ARM::VST2d16: case ARM::VST2d32: - case ARM::VST2d64: case ARM::VST2LNd8: case ARM::VST2LNd16: case ARM::VST2LNd32: @@ -207,7 +213,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VST3d8: case ARM::VST3d16: case ARM::VST3d32: - case ARM::VST3d64: + case ARM::VST1d64T: case ARM::VST3LNd8: case ARM::VST3LNd16: case ARM::VST3LNd32: @@ -252,7 +258,7 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, case ARM::VST4d8: case ARM::VST4d16: case ARM::VST4d32: - case ARM::VST4d64: + case ARM::VST1d64Q: case ARM::VST4LNd8: case ARM::VST4LNd16: case ARM::VST4LNd32: diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 57b65cf..85d5ca0 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -12,6 +12,9 @@ Reimplement 'select' in terms of 
'SEL'. A few ARMv6T2 ops should be pattern matched: BFI, SBFX, and UBFX +Interesting optimization for PIC codegen on arm-linux: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43129 + //===---------------------------------------------------------------------===// Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 29ae631..ad98839 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -200,6 +200,8 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, // It's illegal to emit pop instruction without operands. if (NumRegs) MBB.insert(MI, &*MIB); + else + MF.DeleteMachineInstr(MIB); return true; } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index e4abcdb..55163f9 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -69,7 +69,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == ARM::GPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = @@ -93,7 +93,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL = DebugLoc::getUnknownLoc(); if (I != MBB.end()) DL = I->getDebugLoc(); - if (RC == ARM::GPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp index 39f0749..d539e08 100644 --- a/lib/Target/Alpha/AlphaInstrInfo.cpp +++ b/lib/Target/Alpha/AlphaInstrInfo.cpp @@ -301,7 +301,15 @@ bool AlphaInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TB bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. 
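The same guard keeps reappearing in this patch (ARM and Alpha above, and SPU, Mips, MSP430 and PIC16 below): AnalyzeBranch and RemoveBranch must step over DBG_VALUE instructions so that building with debug info does not change which terminators the branch folder sees. The per-target copies differ only in what they return for an empty block, but the core loop could in principle be factored into a helper; a purely hypothetical sketch of that idea (this patch does not add such a function):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;
    // Return the last non-debug instruction in MBB, or MBB.end() if the
    // block is empty or contains only DBG_VALUEs.
    static MachineBasicBlock::iterator getLastNonDebugInstr(MachineBasicBlock &MBB) {
      MachineBasicBlock::iterator I = MBB.end();
      while (I != MBB.begin()) {
        --I;
        if (!I->isDebugValue())
          return I;
      }
      return MBB.end();
    }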
@@ -362,6 +370,11 @@ unsigned AlphaInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (I->getOpcode() != Alpha::BR && I->getOpcode() != Alpha::COND_BRANCH_I && I->getOpcode() != Alpha::COND_BRANCH_F) diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.td b/lib/Target/Blackfin/BlackfinInstrInfo.td index e3c3993..2471688 100644 --- a/lib/Target/Blackfin/BlackfinInstrInfo.td +++ b/lib/Target/Blackfin/BlackfinInstrInfo.td @@ -65,23 +65,23 @@ def HI16 : SDNodeXForm<imm, [{ //===----------------------------------------------------------------------===// def imm3 : PatLeaf<(imm), [{return isInt<3>(N->getSExtValue());}]>; -def uimm3 : PatLeaf<(imm), [{return isUint<3>(N->getZExtValue());}]>; -def uimm4 : PatLeaf<(imm), [{return isUint<4>(N->getZExtValue());}]>; -def uimm5 : PatLeaf<(imm), [{return isUint<5>(N->getZExtValue());}]>; +def uimm3 : PatLeaf<(imm), [{return isUInt<3>(N->getZExtValue());}]>; +def uimm4 : PatLeaf<(imm), [{return isUInt<4>(N->getZExtValue());}]>; +def uimm5 : PatLeaf<(imm), [{return isUInt<5>(N->getZExtValue());}]>; def uimm5m2 : PatLeaf<(imm), [{ uint64_t value = N->getZExtValue(); - return value % 2 == 0 && isUint<5>(value); + return value % 2 == 0 && isUInt<5>(value); }]>; def uimm6m4 : PatLeaf<(imm), [{ uint64_t value = N->getZExtValue(); - return value % 4 == 0 && isUint<6>(value); + return value % 4 == 0 && isUInt<6>(value); }]>; def imm7 : PatLeaf<(imm), [{return isInt<7>(N->getSExtValue());}]>; def imm16 : PatLeaf<(imm), [{return isInt<16>(N->getSExtValue());}]>; -def uimm16 : PatLeaf<(imm), [{return isUint<16>(N->getZExtValue());}]>; +def uimm16 : PatLeaf<(imm), [{return isUInt<16>(N->getZExtValue());}]>; def ximm16 : PatLeaf<(imm), [{ int64_t value = N->getSExtValue(); @@ -610,8 +610,7 @@ def MOVE_ncccc : F1<(outs NotCC:$cc), (ins JustCC:$sb), "cc = !cc;", []>; def MOVECC_zext : F1<(outs D:$dst), (ins JustCC:$cc), - "$dst = $cc;", - [/*(set D:$dst, (zext JustCC:$cc))*/]>; + "$dst = $cc;", []>; def MOVENCC_z : F1<(outs D:$dst), (ins NotCC:$cc), "$dst = cc;", []>; @@ -859,17 +858,5 @@ def : Pat<(BfinCall (i32 tglobaladdr:$dst)), (CALLa tglobaladdr:$dst)>; def : Pat<(BfinCall (i32 texternalsym:$dst)), (CALLa texternalsym:$dst)>; - -//def : Pat<(sext JustCC:$cc), -// (NEG (MOVECC_zext JustCC:$cc))>; -//def : Pat<(anyext JustCC:$cc), -// (MOVECC_zext JustCC:$cc)>; -def : Pat<(i16 (zext JustCC:$cc)), - (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>; -def : Pat<(i16 (sext JustCC:$cc)), - (EXTRACT_SUBREG (NEG (MOVECC_zext JustCC:$cc)), bfin_subreg_lo16)>; -def : Pat<(i16 (anyext JustCC:$cc)), - (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>; - def : Pat<(i16 (trunc D:$src)), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS D:$src, D)), bfin_subreg_lo16)>; diff --git a/lib/Target/Blackfin/BlackfinIntrinsics.td b/lib/Target/Blackfin/BlackfinIntrinsics.td index bf02cfe..ce21b08 100644 --- a/lib/Target/Blackfin/BlackfinIntrinsics.td +++ b/lib/Target/Blackfin/BlackfinIntrinsics.td @@ -21,14 +21,14 @@ let TargetPrefix = "bfin", isTarget = 1 in { // Execute csync instruction with workarounds def int_bfin_csync : GCCBuiltin<"__builtin_bfin_csync">, - Intrinsic<[llvm_void_ty]>; + Intrinsic<[]>; // Execute ssync instruction with workarounds def int_bfin_ssync : GCCBuiltin<"__builtin_bfin_ssync">, - Intrinsic<[llvm_void_ty]>; + Intrinsic<[]>; // Execute idle instruction with workarounds def int_bfin_idle 
: GCCBuiltin<"__builtin_bfin_idle">, - Intrinsic<[llvm_void_ty]>; + Intrinsic<[]>; } diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp index b39a342..84dc9ca 100644 --- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp +++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp @@ -164,7 +164,7 @@ void BlackfinRegisterInfo::loadConstant(MachineBasicBlock &MBB, return; } - if (isUint<16>(value)) { + if (isUInt<16>(value)) { BuildMI(MBB, I, DL, TII.get(BF::LOADuimm16), Reg).addImm(value); return; } @@ -255,13 +255,13 @@ BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(FIPos==1 && "Bad frame index operand"); MI.getOperand(FIPos).ChangeToRegister(BaseReg, false); MI.getOperand(FIPos+1).setImm(Offset); - if (isUint<6>(Offset)) { + if (isUInt<6>(Offset)) { MI.setDesc(TII.get(isStore ? BF::STORE32p_uimm6m4 : BF::LOAD32p_uimm6m4)); return 0; } - if (BaseReg == BF::FP && isUint<7>(-Offset)) { + if (BaseReg == BF::FP && isUInt<7>(-Offset)) { MI.setDesc(TII.get(isStore ? BF::STORE32fp_nimm7m4 : BF::LOAD32fp_nimm7m4)); diff --git a/lib/Target/CellSPU/SPU.h b/lib/Target/CellSPU/SPU.h index c960974..1f21511 100644 --- a/lib/Target/CellSPU/SPU.h +++ b/lib/Target/CellSPU/SPU.h @@ -15,7 +15,6 @@ #ifndef LLVM_TARGET_IBMCELLSPU_H #define LLVM_TARGET_IBMCELLSPU_H -#include "llvm/System/DataTypes.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -25,73 +24,7 @@ namespace llvm { FunctionPass *createSPUISelDag(SPUTargetMachine &TM); - /*--== Utility functions/predicates/etc used all over the place: --==*/ - //! Predicate test for a signed 10-bit value - /*! - \param Value The input value to be tested - - This predicate tests for a signed 10-bit value, returning the 10-bit value - as a short if true. - */ - template<typename T> - inline bool isS10Constant(T Value); - - template<> - inline bool isS10Constant<short>(short Value) { - int SExtValue = ((int) Value << (32 - 10)) >> (32 - 10); - return ((Value > 0 && Value <= (1 << 9) - 1) - || (Value < 0 && (short) SExtValue == Value)); - } - - template<> - inline bool isS10Constant<int>(int Value) { - return (Value >= -(1 << 9) && Value <= (1 << 9) - 1); - } - - template<> - inline bool isS10Constant<uint32_t>(uint32_t Value) { - return (Value <= ((1 << 9) - 1)); - } - - template<> - inline bool isS10Constant<int64_t>(int64_t Value) { - return (Value >= -(1 << 9) && Value <= (1 << 9) - 1); - } - - template<> - inline bool isS10Constant<uint64_t>(uint64_t Value) { - return (Value <= ((1 << 9) - 1)); - } - - //! Predicate test for an unsigned 10-bit value - /*! - \param Value The input value to be tested - - This predicate tests for an unsigned 10-bit value, returning the 10-bit value - as a short if true. - */ - inline bool isU10Constant(short Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(int Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(uint32_t Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(int64_t Value) { - return (Value == (Value & 0x3ff)); - } - - inline bool isU10Constant(uint64_t Value) { - return (Value == (Value & 0x3ff)); - } - extern Target TheCellSPUTarget; - } // Defines symbolic names for the SPU instructions. 
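The SPU-specific isS10Constant/isU10Constant templates deleted above (like the isUint<> spellings fixed in the Blackfin files earlier) give way to the generic bit-width predicates from llvm/Support/MathExtras.h, used as isInt<10>(V) and isUInt<10>(V) in the hunks that follow. A stand-alone sketch of what those predicates test, with local stand-in names rather than the real declarations:

    #include <cstdint>
    // Signed N-bit range: -(2^(N-1)) .. 2^(N-1)-1.
    // For N == 10 that is -512 .. 511, exactly what isS10Constant checked.
    template <unsigned N> bool fitsSIntN(int64_t V) {
      return V >= -(INT64_C(1) << (N - 1)) && V < (INT64_C(1) << (N - 1));
    }
    // Unsigned N-bit range: 0 .. 2^N-1.
    // For N == 10 that is 0 .. 1023, exactly what isU10Constant checked.
    template <unsigned N> bool fitsUIntN(uint64_t V) {
      return V < (UINT64_C(1) << N);
    }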
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 396a921..90f8310 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -44,28 +44,28 @@ namespace { bool isI64IntS10Immediate(ConstantSDNode *CN) { - return isS10Constant(CN->getSExtValue()); + return isInt<10>(CN->getSExtValue()); } //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates bool isI32IntS10Immediate(ConstantSDNode *CN) { - return isS10Constant(CN->getSExtValue()); + return isInt<10>(CN->getSExtValue()); } //! ConstantSDNode predicate for i32 unsigned 10-bit immediate values bool isI32IntU10Immediate(ConstantSDNode *CN) { - return isU10Constant(CN->getSExtValue()); + return isUInt<10>(CN->getSExtValue()); } //! ConstantSDNode predicate for i16 sign-extended, 10-bit immediate values bool isI16IntS10Immediate(ConstantSDNode *CN) { - return isS10Constant(CN->getSExtValue()); + return isInt<10>(CN->getSExtValue()); } //! SDNode predicate for i16 sign-extended, 10-bit immediate values @@ -80,7 +80,7 @@ namespace { bool isI16IntU10Immediate(ConstantSDNode *CN) { - return isU10Constant((short) CN->getZExtValue()); + return isUInt<10>((short) CN->getZExtValue()); } //! SDNode predicate for i16 sign-extended, 10-bit immediate values diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index e863ee3..4b0d442 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -1107,7 +1107,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset, true, false); SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); - SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8); + unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass); + SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0, false, false, 0); Chain = Store.getOperand(0); @@ -1491,7 +1492,7 @@ SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG, return SDValue(); Value = Value >> 32; } - if (isS10Constant(Value)) + if (isInt<10>(Value)) return DAG.getTargetConstant(Value, ValueType); } diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 2306665..86825c8 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -450,7 +450,15 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. 
@@ -513,6 +521,11 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!isCondBranch(I) && !isUncondBranch(I)) return 0; diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index 5068f77..6d1f87d 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -1268,7 +1268,12 @@ multiclass BitwiseAnd defm AND : BitwiseAnd; -// N.B.: vnot_conv is one of those special target selection pattern fragments, + +def vnot_cell_conv : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v4i32 immAllOnesV)))>; + +// N.B.: vnot_cell_conv is one of those special target selection pattern +// fragments, // in which we expect there to be a bit_convert on the constant. Bear in mind // that llvm translates "not <reg>" to "xor <reg>, -1" (or in this case, a // constant -1 vector.) @@ -1301,7 +1306,7 @@ multiclass AndComplement def r8: ANDCRegInst<R8C>; // Sometimes, the xor pattern has a bitcast constant: - def v16i8_conv: ANDCVecInst<v16i8, vnot_conv>; + def v16i8_conv: ANDCVecInst<v16i8, vnot_cell_conv>; } defm ANDC : AndComplement; @@ -1934,7 +1939,7 @@ multiclass SelectBits def v16i8: SELBVecInst<v16i8>; def v8i16: SELBVecInst<v8i16>; def v4i32: SELBVecInst<v4i32>; - def v2i64: SELBVecInst<v2i64, vnot_conv>; + def v2i64: SELBVecInst<v2i64, vnot_cell_conv>; def r128: SELBRegInst<GPRC>; def r64: SELBRegInst<R64C>; @@ -4373,7 +4378,7 @@ def : Pat<(v2f64 (bitconvert (v16i8 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VECREG:$src))), (v2f64 VECREG:$src)>; -def : Pat<(v2f64 (bitconvert (v2f64 VECREG:$src))), (v2f64 VECREG:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(i128 (bitconvert (v16i8 VECREG:$src))), (ORi128_vec VECREG:$src)>; diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index 8c78bab..f3071f2 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineLocation.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetInstrInfo.h" @@ -336,6 +337,7 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); + DebugLoc dl = II->getDebugLoc(); while (!MI.getOperand(i).isFI()) { ++i; @@ -364,11 +366,22 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, // Replace the FrameIndex with base register with $sp (aka $r1) SPOp.ChangeToRegister(SPU::R1, false); - if (Offset > SPUFrameInfo::maxFrameOffset() - || Offset < SPUFrameInfo::minFrameOffset()) { - errs() << "Large stack adjustment (" - << Offset - << ") in SPURegisterInfo::eliminateFrameIndex."; + + // if 'Offset' doesn't fit to the D-form instruction's + // immediate, convert the instruction to X-form + // if the instruction is not an AI (which takes a s10 immediate), assume + // it is a load/store that can take a s14 immediate + if ((MI.getOpcode() == 
SPU::AIr32 && !isInt<10>(Offset)) + || !isInt<14>(Offset)) { + int newOpcode = convertDFormToXForm(MI.getOpcode()); + unsigned tmpReg = findScratchRegister(II, RS, &SPU::R32CRegClass, SPAdj); + BuildMI(MBB, II, dl, TII.get(SPU::ILr32), tmpReg ) + .addImm(Offset); + BuildMI(MBB, II, dl, TII.get(newOpcode), MI.getOperand(0).getReg()) + .addReg(tmpReg, RegState::Kill) + .addReg(SPU::R1); + // remove the replaced D-form instruction + MBB.erase(II); } else { MO.ChangeToImmediate(Offset); } @@ -423,6 +436,14 @@ void SPURegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, MF.getRegInfo().setPhysRegUnused(SPU::R0); MF.getRegInfo().setPhysRegUnused(SPU::R1); MF.getRegInfo().setPhysRegUnused(SPU::R2); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterClass *RC = &SPU::R32CRegClass; + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + + } void SPURegisterInfo::emitPrologue(MachineFunction &MF) const @@ -448,7 +469,8 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const assert((FrameSize & 0xf) == 0 && "SPURegisterInfo::emitPrologue: FrameSize not aligned"); - if (FrameSize > 0 || MFI->hasCalls()) { + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->hasCalls()) { FrameSize = -(FrameSize + SPUFrameInfo::minStackSize()); if (hasDebugInfo) { // Mark effective beginning of when frame pointer becomes valid. @@ -460,14 +482,14 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const // for the ABI BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16) .addReg(SPU::R1); - if (isS10Constant(FrameSize)) { + if (isInt<10>(FrameSize)) { // Spill $sp to adjusted $sp BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize) .addReg(SPU::R1); // Adjust $sp by required amout BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1) .addImm(FrameSize); - } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) { + } else if (isInt<16>(FrameSize)) { // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use // $r2 to adjust $sp: BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) @@ -475,7 +497,7 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const .addReg(SPU::R1); BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) .addImm(FrameSize); - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1) + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQXr32), SPU::R1) .addReg(SPU::R2) .addReg(SPU::R1); BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) @@ -549,9 +571,11 @@ SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const "Can only insert epilog into returning blocks"); assert((FrameSize & 0xf) == 0 && "SPURegisterInfo::emitEpilogue: FrameSize not aligned"); - if (FrameSize > 0 || MFI->hasCalls()) { + + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->hasCalls()) { FrameSize = FrameSize + SPUFrameInfo::minStackSize(); - if (isS10Constant(FrameSize + LinkSlotOffset)) { + if (isInt<10>(FrameSize + LinkSlotOffset)) { // Reload $lr, adjust $sp by required amount // Note: We do this to slightly improve dual issue -- not by much, but it // is an opportunity for dual issue. 
@@ -574,7 +598,7 @@ SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const .addReg(SPU::R2); BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) .addImm(16) - .addReg(SPU::R2); + .addReg(SPU::R1); BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2). addReg(SPU::R2) .addImm(16); @@ -618,4 +642,43 @@ SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); } +int +SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const +{ + switch(dFormOpcode) + { + case SPU::AIr32: return SPU::Ar32; + case SPU::LQDr32: return SPU::LQXr32; + case SPU::LQDr128: return SPU::LQXr128; + case SPU::LQDv16i8: return SPU::LQXv16i8; + case SPU::LQDv4f32: return SPU::LQXv4f32; + case SPU::STQDr32: return SPU::STQXr32; + case SPU::STQDr128: return SPU::STQXr128; + case SPU::STQDv16i8: return SPU::STQXv16i8; + case SPU::STQDv4i32: return SPU::STQXv4i32; + case SPU::STQDv4f32: return SPU::STQXv4f32; + + default: assert( false && "Unhandled D to X-form conversion"); + } + // default will assert, but need to return something to keep the + // compiler happy. + return dFormOpcode; +} + +// TODO this is already copied from PPC. Could this convenience function +// be moved to the RegScavenger class? +unsigned +SPURegisterInfo::findScratchRegister(MachineBasicBlock::iterator II, + RegScavenger *RS, + const TargetRegisterClass *RC, + int SPAdj) const +{ + assert(RS && "Register scavenging must be on"); + unsigned Reg = RS->FindUnusedReg(RC); + if (Reg == 0) + Reg = RS->scavengeRegister(RC, II, SPAdj); + assert( Reg && "Register scavenger failed"); + return Reg; +} + #include "SPUGenRegisterInfo.inc" diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index 48feb5c..0a70318 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -53,6 +53,10 @@ namespace llvm { virtual const TargetRegisterClass* const * getCalleeSavedRegClasses(const MachineFunction *MF) const; + //! Allow for scavenging, so we can get scratch registers when needed. + virtual bool requiresRegisterScavenging(const MachineFunction &MF) const + { return true; } + //! Return the reserved registers BitVector getReservedRegs(const MachineFunction &MF) const; @@ -97,6 +101,21 @@ namespace llvm { //! Get DWARF debugging register number int getDwarfRegNum(unsigned RegNum, bool isEH) const; + + //! Convert D-form load/store to X-form load/store + /*! + Converts a regiser displacement load/store into a register-indexed + load/store for large stack frames, when the stack frame exceeds the + range of a s10 displacement. + */ + int convertDFormToXForm(int dFormOpcode) const; + + //! Acquire an unused register in an emergency. 
+ unsigned findScratchRegister(MachineBasicBlock::iterator II, + RegScavenger *RS, + const TargetRegisterClass *RC, + int SPAdj) const; + }; } // end namespace llvm diff --git a/lib/Target/MBlaze/MBlazeIntrinsics.td b/lib/Target/MBlaze/MBlazeIntrinsics.td index 76eb563..82552fa 100644 --- a/lib/Target/MBlaze/MBlazeIntrinsics.td +++ b/lib/Target/MBlaze/MBlazeIntrinsics.td @@ -21,11 +21,11 @@ let TargetPrefix = "mblaze", isTarget = 1 in { [llvm_i32_ty], [IntrWriteMem]>; - class MBFSL_Put_Intrinsic : Intrinsic<[llvm_void_ty], + class MBFSL_Put_Intrinsic : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrWriteMem]>; - class MBFSL_PutT_Intrinsic : Intrinsic<[llvm_void_ty], + class MBFSL_PutT_Intrinsic : Intrinsic<[], [llvm_i32_ty], [IntrWriteMem]>; } diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp index ac41cc8..15d16ec 100644 --- a/lib/Target/MSIL/MSILWriter.cpp +++ b/lib/Target/MSIL/MSILWriter.cpp @@ -424,7 +424,7 @@ void MSILWriter::printPtrLoad(uint64_t N) { case Module::Pointer32: printSimpleInstruction("ldc.i4",utostr(N).c_str()); // FIXME: Need overflow test? - if (!isUInt32(N)) { + if (!isUInt<32>(N)) { errs() << "Value = " << utostr(N) << '\n'; llvm_unreachable("32-bit pointer overflowed"); } diff --git a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp index 32c6b04..f4d7d8a 100644 --- a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp +++ b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp @@ -59,7 +59,7 @@ GetJumpTableSymbol(const MachineOperand &MO) const { } // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCSymbol *MSP430MCInstLower:: @@ -75,7 +75,7 @@ GetConstantPoolIndexSymbol(const MachineOperand &MO) const { } // Create a symbol for the name. - return Ctx.GetOrCreateTemporarySymbol(Name.str()); + return Ctx.GetOrCreateSymbol(Name.str()); } MCOperand MSP430MCInstLower:: diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index 6372482..e584770 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -173,6 +173,8 @@ unsigned MSP430InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; if (I->getOpcode() != MSP430::JMP && I->getOpcode() != MSP430::JCC) break; @@ -241,6 +243,9 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; + // Working from the bottom, when we see a non-terminator // instruction, we're done. 
if (!isUnpredicatedTerminator(I)) diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp index fb93706..1d5c511 100644 --- a/lib/Target/Mangler.cpp +++ b/lib/Target/Mangler.cpp @@ -235,10 +235,7 @@ std::string Mangler::getNameWithPrefix(const GlobalValue *GV, MCSymbol *Mangler::getSymbol(const GlobalValue *GV) { SmallString<60> NameStr; getNameWithPrefix(NameStr, GV, false); - if (!GV->hasPrivateLinkage()) - return Context.GetOrCreateSymbol(NameStr.str()); - - return Context.GetOrCreateTemporarySymbol(NameStr.str()); + return Context.GetOrCreateSymbol(NameStr.str()); } diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index fa4518d..e948917 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -26,8 +26,9 @@ // Floating Point Compare and Branch def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; -def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<0>, - SDTCisInt<2>]>; +def SDT_MipsFPCmp : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisFP<1>, + SDTCisInt<3>]>; def SDT_MipsFPSelectCC : SDTypeProfile<1, 4, [SDTCisInt<1>, SDTCisInt<4>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>; @@ -244,12 +245,13 @@ def MIPS_FCOND_NGT : PatLeaf<(i32 15)>; /// Floating Point Compare let hasDelaySlot = 1, Defs=[FCR31] in { def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc), - "c.$cc.s $fs, $ft", [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc), - (implicit FCR31)]>; + "c.$cc.s $fs, $ft", + [(set FCR31, (MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc))]>; def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc), - "c.$cc.d $fs, $ft", [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc), - (implicit FCR31)]>, Requires<[In32BitMode]>; + "c.$cc.d $fs, $ft", + [(set FCR31, (MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc))]>, + Requires<[In32BitMode]>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index 1a9bffc..85cf064 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -433,7 +433,15 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -562,6 +570,11 @@ RemoveBranch(MachineBasicBlock &MBB) const MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (I->getOpcode() != Mips::J && GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID) return 0; diff --git a/lib/Target/PIC16/PIC16InstrInfo.cpp b/lib/Target/PIC16/PIC16InstrInfo.cpp index 2fb405e..da16e83 100644 --- a/lib/Target/PIC16/PIC16InstrInfo.cpp +++ b/lib/Target/PIC16/PIC16InstrInfo.cpp @@ -226,6 +226,11 @@ bool PIC16InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // Get the terminator instruction. --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return true; + --I; + } // Handle unconditional branches. If the unconditional branch's target is // successor basic block then remove the unconditional branch. 
if (I->getOpcode() == PIC16::br_uncond && AllowModify) { diff --git a/lib/Target/PIC16/PIC16Section.cpp b/lib/Target/PIC16/PIC16Section.cpp index a96ebb8..2505b11 100644 --- a/lib/Target/PIC16/PIC16Section.cpp +++ b/lib/Target/PIC16/PIC16Section.cpp @@ -17,10 +17,9 @@ using namespace llvm; // This is the only way to create a PIC16Section. Sections created here // do not need to be explicitly deleted as they are managed by auto_ptrs. -PIC16Section *PIC16Section::Create(const StringRef &Name, - PIC16SectionType Ty, - const std::string &Address, - int Color, MCContext &Ctx) { +PIC16Section *PIC16Section::Create(StringRef Name, PIC16SectionType Ty, + StringRef Address, int Color, + MCContext &Ctx) { /// Determine the internal SectionKind info. /// Users of PIC16Section class should not need to know the internal @@ -59,8 +58,17 @@ PIC16Section *PIC16Section::Create(const StringRef &Name, } + // Copy strings into context allocated memory so they get free'd when the + // context is destroyed. + char *NameCopy = static_cast<char*>(Ctx.Allocate(Name.size(), 1)); + memcpy(NameCopy, Name.data(), Name.size()); + char *AddressCopy = static_cast<char*>(Ctx.Allocate(Address.size(), 1)); + memcpy(AddressCopy, Address.data(), Address.size()); + // Create the Section. - PIC16Section *S = new (Ctx) PIC16Section(Name, K, Address, Color); + PIC16Section *S = + new (Ctx) PIC16Section(StringRef(NameCopy, Name.size()), K, + StringRef(AddressCopy, Address.size()), Color); S->T = Ty; return S; } diff --git a/lib/Target/PIC16/PIC16Section.h b/lib/Target/PIC16/PIC16Section.h index 566f920..9039ca7 100644 --- a/lib/Target/PIC16/PIC16Section.h +++ b/lib/Target/PIC16/PIC16Section.h @@ -30,11 +30,11 @@ namespace llvm { PIC16SectionType T; /// Name of the section to uniquely identify it. - std::string Name; + StringRef Name; /// User can specify an address at which a section should be placed. /// Negative value here means user hasn't specified any. - std::string Address; + StringRef Address; /// Overlay information - Sections with same color can be overlaid on /// one another. @@ -43,17 +43,16 @@ namespace llvm { /// Total size of all data objects contained here. unsigned Size; - PIC16Section(const StringRef &name, SectionKind K, const std::string &addr, - int color) + PIC16Section(StringRef name, SectionKind K, StringRef addr, int color) : MCSection(K), Name(name), Address(addr), Color(color), Size(0) { } public: /// Return the name of the section. - const std::string &getName() const { return Name; } + StringRef getName() const { return Name; } /// Return the Address of the section. - const std::string &getAddress() const { return Address; } + StringRef getAddress() const { return Address; } /// Return the Color of the section. int getColor() const { return Color; } @@ -64,6 +63,8 @@ namespace llvm { void setSize(unsigned size) { Size = size; } /// Conatined data objects. + // FIXME: This vector is leaked because sections are allocated with a + // BumpPtrAllocator. std::vector<const GlobalVariable *>Items; /// Check section type. @@ -77,8 +78,8 @@ namespace llvm { PIC16SectionType getType() const { return T; } /// This would be the only way to create a section. 
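// The PIC16Section hunks above replace the owned std::string members with
// StringRef, so Create() must copy the incoming characters into storage owned
// by the MCContext; otherwise the stored StringRefs could dangle once the
// caller's temporaries die. A minimal sketch of that ownership pattern,
// reusing the calls shown in the hunk (the helper name is illustrative only,
// not part of the patch):

StringRef copyIntoContext(MCContext &Ctx, StringRef Str) {
  // Ctx.Allocate() hands out memory that lives until the context is torn
  // down, matching the lifetime of the MCSection that keeps the reference.
  char *Buf = static_cast<char*>(Ctx.Allocate(Str.size(), 1));
  memcpy(Buf, Str.data(), Str.size());
  return StringRef(Buf, Str.size());
}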
- static PIC16Section *Create(const StringRef &Name, PIC16SectionType Ty, - const std::string &Address, int Color, + static PIC16Section *Create(StringRef Name, PIC16SectionType Ty, + StringRef Address, int Color, MCContext &Ctx); /// Override this as PIC16 has its own way of printing switching diff --git a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp index ed6fc9d..5adefd3 100644 --- a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp @@ -309,8 +309,8 @@ namespace { const MCSymbol *&TOCEntry = TOC[Sym]; if (TOCEntry == 0) TOCEntry = OutContext. - GetOrCreateTemporarySymbol(StringRef(MAI->getPrivateGlobalPrefix()) + - "C" + Twine(LabelID++)); + GetOrCreateSymbol(StringRef(MAI->getPrivateGlobalPrefix()) + + "C" + Twine(LabelID++)); O << *TOCEntry << "@toc"; } @@ -674,14 +674,14 @@ static const MCSymbol *GetLazyPtr(const MCSymbol *Sym, MCContext &Ctx) { // Remove $stub suffix, add $lazy_ptr. SmallString<128> TmpStr(Sym->getName().begin(), Sym->getName().end()-5); TmpStr += "$lazy_ptr"; - return Ctx.GetOrCreateTemporarySymbol(TmpStr.str()); + return Ctx.GetOrCreateSymbol(TmpStr.str()); } static const MCSymbol *GetAnonSym(const MCSymbol *Sym, MCContext &Ctx) { // Add $tmp suffix to $stub, yielding $stub$tmp. SmallString<128> TmpStr(Sym->getName().begin(), Sym->getName().end()); TmpStr += "$tmp"; - return Ctx.GetOrCreateTemporarySymbol(TmpStr.str()); + return Ctx.GetOrCreateSymbol(TmpStr.str()); } void PPCDarwinAsmPrinter:: @@ -811,6 +811,11 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { OutStreamer.EmitIntValue(0, isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/); else // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. + // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/); diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index a752421..52948c8 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -130,7 +130,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { } // If this branch is in range, ignore it. - if (isInt16(BranchSize)) { + if (isInt<16>(BranchSize)) { MBBStartOffset += 4; continue; } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9d79c0d..4f88d35d 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -470,11 +470,11 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, if (CC == ISD::SETEQ || CC == ISD::SETNE) { if (isInt32Immediate(RHS, Imm)) { // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt16(Imm)) + if (isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, getI32Imm(Imm & 0xFFFF)), 0); // If this is a 16-bit signed immediate, fold it. 
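// Throughout this patch the old fixed-width helpers (isInt16, isUInt16,
// isInt32, ...) give way to the templated isInt<N>() / isUInt<N>() forms from
// llvm/Support/MathExtras.h. The range test is unchanged; a small illustration
// with a made-up immediate (not part of the patch):

int64_t Imm = 0x5678;
if (isUInt<16>(Imm)) {
  // 0 <= Imm <= 0xFFFF: the unsigned 16-bit immediate form (e.g. CMPLWI) fits.
}
if (isInt<16>(Imm)) {
  // -32768 <= Imm <= 32767: the signed 16-bit form (e.g. CMPWI) fits.
}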
- if (isInt16((int)Imm)) + if (isInt<16>((int)Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, getI32Imm(Imm & 0xFFFF)), 0); @@ -494,7 +494,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, } Opc = PPC::CMPLW; } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt32Immediate(RHS, Imm) && isUInt16(Imm)) + if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, getI32Imm(Imm & 0xFFFF)), 0); Opc = PPC::CMPLW; @@ -511,11 +511,11 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, if (CC == ISD::SETEQ || CC == ISD::SETNE) { if (isInt64Immediate(RHS.getNode(), Imm)) { // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt16(Imm)) + if (isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, getI32Imm(Imm & 0xFFFF)), 0); // If this is a 16-bit signed immediate, fold it. - if (isInt16(Imm)) + if (isInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, getI32Imm(Imm & 0xFFFF)), 0); @@ -528,7 +528,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, // xoris r0,r3,0x1234 // cmpldi cr0,r0,0x5678 // beq cr0,L6 - if (isUInt32(Imm)) { + if (isUInt<32>(Imm)) { SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, getI64Imm(Imm >> 16)), 0); return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, @@ -537,7 +537,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, } Opc = PPC::CMPLD; } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt64Immediate(RHS.getNode(), Imm) && isUInt16(Imm)) + if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, getI64Imm(Imm & 0xFFFF)), 0); Opc = PPC::CMPLD; @@ -761,12 +761,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { unsigned Shift = 0; // If it can't be represented as a 32 bit value. - if (!isInt32(Imm)) { + if (!isInt<32>(Imm)) { Shift = CountTrailingZeros_64(Imm); int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; // If the shifted value fits 32 bits. - if (isInt32(ImmSh)) { + if (isInt<32>(ImmSh)) { // Go with the shifted value. Imm = ImmSh; } else { @@ -785,7 +785,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { unsigned Hi = (Imm >> 16) & 0xFFFF; // Simple value. - if (isInt16(Imm)) { + if (isInt<16>(Imm)) { // Just the Lo bits. Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); } else if (Lo) { diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 2c072c1..e67666d 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -5539,8 +5539,16 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { return false; } -EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, +/// getOptimalMemOpType - Returns the target specific optimal type for load +/// and store operations as a result of memset, memcpy, and memmove lowering. +/// If DstAlign is zero that means it's safe to destination alignment can +/// satisfy any constraint. Similarly if SrcAlign is zero it means there +/// isn't a need to check it against alignment requirement, probably because +/// the source does not need to be loaded. It returns EVT::Other if +/// SelectionDAG should be responsible for determining it. 
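// The comment block above spells out the convention for the rewritten
// getOptimalMemOpType() hook: DstAlign == 0 means the destination can be
// assumed to satisfy any alignment constraint, and SrcAlign == 0 means the
// source's alignment never needs checking, typically because nothing is
// loaded from it (a memset, or a constant materialized directly). One
// hypothetical query against the PPC body that follows (illustrative only):
//
//   Lowering memset(p, 0, 64) on a 64-bit PowerPC subtarget:
//     getOptimalMemOpType(/*Size=*/64, /*DstAlign=*/0, /*SrcAlign=*/0,
//                         /*SafeToUseFP=*/true, DAG)
//   returns MVT::i64, so any inline expansion uses 8-byte stores as its unit.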
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const { if (this->PPCSubTarget.isPPC64()) { return MVT::i64; diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 9c390ac..19fefab 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -347,9 +347,16 @@ namespace llvm { virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; - virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, - SelectionDAG &DAG) const; + /// getOptimalMemOpType - Returns the target specific optimal type for load + /// and store operations as a result of memset, memcpy, and memmove lowering. + /// If DstAlign is zero that means it's safe to destination alignment can + /// satisfy any constraint. Similarly if SrcAlign is zero it means there + /// isn't a need to check it against alignment requirement, probably because + /// the source does not need to be loaded. It returns EVT::Other if + /// SelectionDAG should be responsible for determining it. + virtual EVT getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const; /// getFunctionAlignment - Return the Log2 alignment of this function. virtual unsigned getFunctionAlignment(const Function *F) const; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 3ff8f27..256370f 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -15,6 +15,10 @@ // Altivec transformation functions and pattern fragments. // +// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be +// of that type. 
+def vnot_ppc : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ @@ -321,7 +325,8 @@ def VAND : VXForm_1<1028, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), [(set VRRC:$vD, (and (v4i32 VRRC:$vA), VRRC:$vB))]>; def VANDC : VXForm_1<1092, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), "vandc $vD, $vA, $vB", VecFP, - [(set VRRC:$vD, (and (v4i32 VRRC:$vA), (vnot VRRC:$vB)))]>; + [(set VRRC:$vD, (and (v4i32 VRRC:$vA), + (vnot_ppc VRRC:$vB)))]>; def VCFSX : VXForm_1<842, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB), "vcfsx $vD, $vB, $UIMM", VecFP, @@ -435,7 +440,8 @@ def VSUM4UBS: VX1_Int<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs>; def VNOR : VXForm_1<1284, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), "vnor $vD, $vA, $vB", VecFP, - [(set VRRC:$vD, (vnot (or (v4i32 VRRC:$vA), VRRC:$vB)))]>; + [(set VRRC:$vD, (vnot_ppc (or (v4i32 VRRC:$vA), + VRRC:$vB)))]>; def VOR : VXForm_1<1156, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB), "vor $vD, $vA, $vB", VecFP, [(set VRRC:$vD, (or (v4i32 VRRC:$vA), VRRC:$vB))]>; @@ -640,12 +646,11 @@ def:Pat<(vmrghw_unary_shuffle (v16i8 VRRC:$vA), undef), (VMRGHW VRRC:$vA, VRRC:$vA)>; // Logical Operations -def : Pat<(v4i32 (vnot VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; -def : Pat<(v4i32 (vnot_conv VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; +def : Pat<(v4i32 (vnot_ppc VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; -def : Pat<(v4i32 (vnot_conv (or VRRC:$A, VRRC:$B))), +def : Pat<(v4i32 (vnot_ppc (or VRRC:$A, VRRC:$B))), (VNOR VRRC:$A, VRRC:$B)>; -def : Pat<(v4i32 (and VRRC:$A, (vnot_conv VRRC:$B))), +def : Pat<(v4i32 (and VRRC:$A, (vnot_ppc VRRC:$B))), (VANDC VRRC:$A, VRRC:$B)>; def : Pat<(fmul VRRC:$vA, VRRC:$vB), diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 9895bea..82c637e 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -213,7 +213,15 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -281,6 +289,11 @@ unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC) return 0; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 6e7880e..44c5fe6 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -512,7 +512,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineInstr *MI = I; DebugLoc dl = MI->getDebugLoc(); - if (isInt16(CalleeAmt)) { + if (isInt<16>(CalleeAmt)) { BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg).addReg(StackReg). 
addImm(CalleeAmt); } else { @@ -596,7 +596,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II, else Reg = PPC::R0; - if (MaxAlign < TargetAlign && isInt16(FrameSize)) { + if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) .addReg(PPC::R31) .addImm(FrameSize); @@ -798,7 +798,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // clear can be encoded. This is extremely uncommon, because normally you // only "std" to a stack slot that is at least 4-byte aligned, but it can // happen in invalid code. - if (isInt16(Offset) && (!isIXAddr || (Offset & 3) == 0)) { + if (isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) { if (isIXAddr) Offset >>= 2; // The actual encoded value has the low two bits zero. MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); @@ -1375,8 +1375,9 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { if (!isPPC64) { // PPC32. if (ALIGN_STACK && MaxAlign > TargetAlign) { - assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!"); - assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!"); + assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) && + "Invalid alignment!"); + assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!"); BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), PPC::R0) .addReg(PPC::R1) @@ -1390,7 +1391,7 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { .addReg(PPC::R1) .addReg(PPC::R1) .addReg(PPC::R0); - } else if (isInt16(NegFrameSize)) { + } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1) .addReg(PPC::R1) .addImm(NegFrameSize) @@ -1408,8 +1409,9 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { } } else { // PPC64. if (ALIGN_STACK && MaxAlign > TargetAlign) { - assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!"); - assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!"); + assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) && + "Invalid alignment!"); + assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!"); BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), PPC::X0) .addReg(PPC::X1) @@ -1422,7 +1424,7 @@ PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { .addReg(PPC::X1) .addReg(PPC::X1) .addReg(PPC::X0); - } else if (isInt16(NegFrameSize)) { + } else if (isInt<16>(NegFrameSize)) { BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1) .addReg(PPC::X1) .addImm(NegFrameSize / 4) @@ -1591,7 +1593,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, // enabled (=> hasFastCall()==true) the fastcc call might contain a tail // call which invalidates the stack pointer value in SP(0). So we use the // value of R31 in this case. 
- if (FI->hasFastCall() && isInt16(FrameSize)) { + if (FI->hasFastCall() && isInt<16>(FrameSize)) { assert(hasFP(MF) && "Expecting a valid the frame pointer."); BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1) .addReg(PPC::R31).addImm(FrameSize); @@ -1605,7 +1607,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, .addReg(PPC::R1) .addReg(PPC::R31) .addReg(PPC::R0); - } else if (isInt16(FrameSize) && + } else if (isInt<16>(FrameSize) && (!ALIGN_STACK || TargetAlign >= MaxAlign) && !MFI->hasVarSizedObjects()) { BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1) @@ -1615,7 +1617,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, .addImm(0).addReg(PPC::R1); } } else { - if (FI->hasFastCall() && isInt16(FrameSize)) { + if (FI->hasFastCall() && isInt<16>(FrameSize)) { assert(hasFP(MF) && "Expecting a valid the frame pointer."); BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1) .addReg(PPC::X31).addImm(FrameSize); @@ -1629,7 +1631,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, .addReg(PPC::X1) .addReg(PPC::X31) .addReg(PPC::X0); - } else if (isInt16(FrameSize) && TargetAlign >= MaxAlign && + } else if (isInt<16>(FrameSize) && TargetAlign >= MaxAlign && !MFI->hasVarSizedObjects()) { BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1) .addReg(PPC::X1).addImm(FrameSize); @@ -1678,7 +1680,7 @@ void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, unsigned LISInstr = isPPC64 ? PPC::LIS8 : PPC::LIS; unsigned ORIInstr = isPPC64 ? PPC::ORI8 : PPC::ORI; - if (CallerAllocatedAmt && isInt16(CallerAllocatedAmt)) { + if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { BuildMI(MBB, MBBI, dl, TII.get(ADDIInstr), StackReg) .addReg(StackReg).addImm(CallerAllocatedAmt); } else { diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index f46840c..8c5e905 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -316,19 +316,19 @@ def FBCONVF64 : Pseudo<(outs FP64:$dst), (ins GR64:$src), let Defs = [PSW] in { def FCMP32rr : Pseudo<(outs), (ins FP32:$src1, FP32:$src2), "cebr\t$src1, $src2", - [(SystemZcmp FP32:$src1, FP32:$src2), (implicit PSW)]>; + [(set PSW, (SystemZcmp FP32:$src1, FP32:$src2))]>; def FCMP64rr : Pseudo<(outs), (ins FP64:$src1, FP64:$src2), "cdbr\t$src1, $src2", - [(SystemZcmp FP64:$src1, FP64:$src2), (implicit PSW)]>; + [(set PSW, (SystemZcmp FP64:$src1, FP64:$src2))]>; def FCMP32rm : Pseudo<(outs), (ins FP32:$src1, rriaddr12:$src2), "ceb\t$src1, $src2", - [(SystemZcmp FP32:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp FP32:$src1, + (load rriaddr12:$src2)))]>; def FCMP64rm : Pseudo<(outs), (ins FP64:$src1, rriaddr12:$src2), "cdb\t$src1, $src2", - [(SystemZcmp FP64:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp FP64:$src1, + (load rriaddr12:$src2)))]>; } // Defs = [PSW] //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 5fa7e8c..06f01e7 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -424,6 +424,8 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; // Working from the bottom, when we see a non-terminator // instruction, we're done. 
if (!isUnpredicatedTerminator(I)) @@ -500,6 +502,8 @@ unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; if (I->getOpcode() != SystemZ::JMP && getCondFromBranchOpc(I->getOpcode()) == SystemZCC::INVALID) break; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 0d1af23..22bde4e 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -31,7 +31,8 @@ class SDTCisI64<int OpNum> : SDTCisVT<OpNum, i64>; def SDT_SystemZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; def SDT_SystemZCallSeqStart : SDCallSeqStart<[SDTCisI64<0>]>; def SDT_SystemZCallSeqEnd : SDCallSeqEnd<[SDTCisI64<0>, SDTCisI64<1>]>; -def SDT_CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; +def SDT_CmpTest : SDTypeProfile<1, 2, [SDTCisI64<0>, + SDTCisSameAs<1, 2>]>; def SDT_BrCond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisI8<1>, SDTCisVT<2, i64>]>; @@ -980,100 +981,89 @@ let Defs = [PSW] in { def CMP32rr : RRI<0x19, (outs), (ins GR32:$src1, GR32:$src2), "cr\t$src1, $src2", - [(SystemZcmp GR32:$src1, GR32:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, GR32:$src2))]>; def CMP64rr : RREI<0xB920, (outs), (ins GR64:$src1, GR64:$src2), "cgr\t$src1, $src2", - [(SystemZcmp GR64:$src1, GR64:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR64:$src1, GR64:$src2))]>; def CMP32ri : RILI<0xC2D, (outs), (ins GR32:$src1, s32imm:$src2), "cfi\t$src1, $src2", - [(SystemZcmp GR32:$src1, imm:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, imm:$src2))]>; def CMP64ri32 : RILI<0xC2C, (outs), (ins GR64:$src1, s32imm64:$src2), "cgfi\t$src1, $src2", - [(SystemZcmp GR64:$src1, i64immSExt32:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR64:$src1, i64immSExt32:$src2))]>; def CMP32rm : RXI<0x59, (outs), (ins GR32:$src1, rriaddr12:$src2), "c\t$src1, $src2", - [(SystemZcmp GR32:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, (load rriaddr12:$src2)))]>; def CMP32rmy : RXYI<0xE359, (outs), (ins GR32:$src1, rriaddr:$src2), "cy\t$src1, $src2", - [(SystemZcmp GR32:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR32:$src1, (load rriaddr:$src2)))]>; def CMP64rm : RXYI<0xE320, (outs), (ins GR64:$src1, rriaddr:$src2), "cg\t$src1, $src2", - [(SystemZcmp GR64:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZcmp GR64:$src1, (load rriaddr:$src2)))]>; def UCMP32rr : RRI<0x15, (outs), (ins GR32:$src1, GR32:$src2), "clr\t$src1, $src2", - [(SystemZucmp GR32:$src1, GR32:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, GR32:$src2))]>; def UCMP64rr : RREI<0xB921, (outs), (ins GR64:$src1, GR64:$src2), "clgr\t$src1, $src2", - [(SystemZucmp GR64:$src1, GR64:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, GR64:$src2))]>; def UCMP32ri : RILI<0xC2F, (outs), (ins GR32:$src1, i32imm:$src2), "clfi\t$src1, $src2", - [(SystemZucmp GR32:$src1, imm:$src2), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, imm:$src2))]>; def UCMP64ri32 : RILI<0xC2E, (outs), (ins GR64:$src1, i64i32imm:$src2), "clgfi\t$src1, $src2", - [(SystemZucmp GR64:$src1, i64immZExt32:$src2), - (implicit PSW)]>; + [(set PSW,(SystemZucmp GR64:$src1, i64immZExt32:$src2))]>; def UCMP32rm : RXI<0x55, (outs), (ins GR32:$src1, rriaddr12:$src2), "cl\t$src1, $src2", - [(SystemZucmp GR32:$src1, (load rriaddr12:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp 
GR32:$src1, + (load rriaddr12:$src2)))]>; def UCMP32rmy : RXYI<0xE355, (outs), (ins GR32:$src1, rriaddr:$src2), "cly\t$src1, $src2", - [(SystemZucmp GR32:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR32:$src1, + (load rriaddr:$src2)))]>; def UCMP64rm : RXYI<0xE351, (outs), (ins GR64:$src1, rriaddr:$src2), "clg\t$src1, $src2", - [(SystemZucmp GR64:$src1, (load rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (load rriaddr:$src2)))]>; def CMPSX64rr32 : RREI<0xB930, (outs), (ins GR64:$src1, GR32:$src2), "cgfr\t$src1, $src2", - [(SystemZucmp GR64:$src1, (sext GR32:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (sext GR32:$src2)))]>; def UCMPZX64rr32 : RREI<0xB931, (outs), (ins GR64:$src1, GR32:$src2), "clgfr\t$src1, $src2", - [(SystemZucmp GR64:$src1, (zext GR32:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (zext GR32:$src2)))]>; def CMPSX64rm32 : RXYI<0xE330, (outs), (ins GR64:$src1, rriaddr:$src2), "cgf\t$src1, $src2", - [(SystemZucmp GR64:$src1, (sextloadi64i32 rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (sextloadi64i32 rriaddr:$src2)))]>; def UCMPZX64rm32 : RXYI<0xE331, (outs), (ins GR64:$src1, rriaddr:$src2), "clgf\t$src1, $src2", - [(SystemZucmp GR64:$src1, (zextloadi64i32 rriaddr:$src2)), - (implicit PSW)]>; + [(set PSW, (SystemZucmp GR64:$src1, + (zextloadi64i32 rriaddr:$src2)))]>; // FIXME: Add other crazy ucmp forms diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp index c3dcf8e..66bb914 100644 --- a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp +++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp @@ -524,6 +524,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); else // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs. However, sometimes the types are local to the file. So + // we need to fill in the value for the NLP in those cases. 
OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), 4/*size*/, 0/*addrspace*/); diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp index 7d29d97..c851ca3 100644 --- a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp +++ b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp @@ -83,7 +83,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { case X86II::MO_DARWIN_NONLAZY: case X86II::MO_DARWIN_NONLAZY_PIC_BASE: { Name += "$non_lazy_ptr"; - MCSymbol *Sym = Ctx.GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getGVStubEntry(Sym); @@ -98,7 +98,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: { Name += "$non_lazy_ptr"; - MCSymbol *Sym = Ctx.GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getHiddenGVStubEntry(Sym); if (StubSym.getPointer() == 0) { @@ -112,7 +112,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } case X86II::MO_DARWIN_STUB: { Name += "$stub"; - MCSymbol *Sym = Ctx.GetOrCreateTemporarySymbol(Name.str()); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); MachineModuleInfoImpl::StubValueTy &StubSym = getMachOMMI().getFnStubEntry(Sym); if (StubSym.getPointer()) @@ -127,7 +127,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { Name.erase(Name.end()-5, Name.end()); StubSym = MachineModuleInfoImpl:: - StubValueTy(Ctx.GetOrCreateTemporarySymbol(Name.str()), false); + StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false); } return Sym; } @@ -287,7 +287,9 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { LowerUnaryToTwoAddr(OutMI, X86::MMX_PCMPEQDrr); break; case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::V_SET0: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; + case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; + case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break; + case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; case X86::MOV16r0: diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 4d3dedf..22285f1 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(X86GenCallingConv.inc -gen-callingconv) tablegen(X86GenSubtarget.inc -gen-subtarget) set(sources + SSEDomainFix.cpp X86AsmBackend.cpp X86CodeEmitter.cpp X86COFFMachineModuleInfo.cpp diff --git a/lib/Target/X86/SSEDomainFix.cpp b/lib/Target/X86/SSEDomainFix.cpp new file mode 100644 index 0000000..395ab57 --- /dev/null +++ b/lib/Target/X86/SSEDomainFix.cpp @@ -0,0 +1,499 @@ +//===- SSEDomainFix.cpp - Use proper int/float domain for SSE ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the SSEDomainFix pass. +// +// Some SSE instructions like mov, and, or, xor are available in different +// variants for different operand types. 
These variant instructions are +// equivalent, but on Nehalem and newer cpus there is extra latency +// transferring data between integer and floating point domains. +// +// This pass changes the variant instructions to minimize domain crossings. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sse-domain-fix" +#include "X86InstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { + +/// Allocate objects from a pool, allow objects to be recycled, and provide a +/// way of deleting everything. +template<typename T, unsigned PageSize = 64> +class PoolAllocator { + std::vector<T*> Pages, Avail; +public: + ~PoolAllocator() { Clear(); } + + T* Alloc() { + if (Avail.empty()) { + T *p = new T[PageSize]; + Pages.push_back(p); + Avail.reserve(PageSize); + for (unsigned n = 0; n != PageSize; ++n) + Avail.push_back(p+n); + } + T *p = Avail.back(); + Avail.pop_back(); + return p; + } + + // Allow object to be reallocated. It won't be reconstructed. + void Recycle(T *p) { + p->clear(); + Avail.push_back(p); + } + + // Destroy all objects, make sure there are no external pointers to them. + void Clear() { + Avail.clear(); + while (!Pages.empty()) { + delete[] Pages.back(); + Pages.pop_back(); + } + } +}; + +/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track +/// of execution domains. +/// +/// An open DomainValue represents a set of instructions that can still switch +/// execution domain. Multiple registers may refer to the same open +/// DomainValue - they will eventually be collapsed to the same execution +/// domain. +/// +/// A collapsed DomainValue represents a single register that has been forced +/// into one of more execution domains. There is a separate collapsed +/// DomainValue for each register, but it may contain multiple execution +/// domains. A register value is initially created in a single execution +/// domain, but if we were forced to pay the penalty of a domain crossing, we +/// keep track of the fact the the register is now available in multiple +/// domains. +struct DomainValue { + // Basic reference counting. + unsigned Refs; + + // Available domains. For an open DomainValue, it is the still possible + // domains for collapsing. For a collapsed DomainValue it is the domains where + // the register is available for free. + unsigned Mask; + + // Position of the last defining instruction. + unsigned Dist; + + // Twiddleable instructions using or defining these registers. + SmallVector<MachineInstr*, 8> Instrs; + + // Collapsed DomainValue have no instructions to twiddle - it simply keeps + // track of the domains where the registers are already available. + bool collapsed() const { return Instrs.empty(); } + + // Is any domain in mask available? + bool compat(unsigned mask) const { + return Mask & mask; + } + + // Mark domain as available. + void add(unsigned domain) { + Mask |= 1u << domain; + } + + // First domain available in mask. 
+ unsigned firstDomain() const { + return CountTrailingZeros_32(Mask); + } + + DomainValue() { clear(); } + + void clear() { + Refs = Mask = Dist = 0; + Instrs.clear(); + } +}; + +static const unsigned NumRegs = 16; + +class SSEDomainFixPass : public MachineFunctionPass { + static char ID; + PoolAllocator<DomainValue> Pool; + + MachineFunction *MF; + const X86InstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineBasicBlock *MBB; + DomainValue **LiveRegs; + typedef DenseMap<MachineBasicBlock*,DomainValue**> LiveOutMap; + LiveOutMap LiveOuts; + unsigned Distance; + +public: + SSEDomainFixPass() : MachineFunctionPass(&ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "SSE execution domain fixup"; + } + +private: + // Register mapping. + int RegIndex(unsigned Reg); + + // LiveRegs manipulations. + void SetLiveReg(int rx, DomainValue *DV); + void Kill(int rx); + void Force(int rx, unsigned domain); + void Collapse(DomainValue *dv, unsigned domain); + bool Merge(DomainValue *A, DomainValue *B); + + void enterBasicBlock(); + void visitGenericInstr(MachineInstr*); + void visitSoftInstr(MachineInstr*, unsigned mask); + void visitHardInstr(MachineInstr*, unsigned domain); +}; +} + +char SSEDomainFixPass::ID = 0; + +/// Translate TRI register number to an index into our smaller tables of +/// interesting registers. Return -1 for boring registers. +int SSEDomainFixPass::RegIndex(unsigned reg) { + // Registers are sorted lexicographically. + // We just need them to be consecutive, ordering doesn't matter. + assert(X86::XMM9 == X86::XMM0+NumRegs-1 && "Unexpected sort"); + reg -= X86::XMM0; + return reg < NumRegs ? reg : -1; +} + +/// Set LiveRegs[rx] = dv, updating reference counts. +void SSEDomainFixPass::SetLiveReg(int rx, DomainValue *dv) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + if (!LiveRegs) + LiveRegs = (DomainValue**)calloc(sizeof(DomainValue*), NumRegs); + + if (LiveRegs[rx] == dv) + return; + if (LiveRegs[rx]) { + assert(LiveRegs[rx]->Refs && "Bad refcount"); + if (--LiveRegs[rx]->Refs == 0) Pool.Recycle(LiveRegs[rx]); + } + LiveRegs[rx] = dv; + if (dv) ++dv->Refs; +} + +// Kill register rx, recycle or collapse any DomainValue. +void SSEDomainFixPass::Kill(int rx) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + if (!LiveRegs || !LiveRegs[rx]) return; + + // Before killing the last reference to an open DomainValue, collapse it to + // the first available domain. + if (LiveRegs[rx]->Refs == 1 && !LiveRegs[rx]->collapsed()) + Collapse(LiveRegs[rx], LiveRegs[rx]->firstDomain()); + else + SetLiveReg(rx, 0); +} + +/// Force register rx into domain. +void SSEDomainFixPass::Force(int rx, unsigned domain) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + DomainValue *dv; + if (LiveRegs && (dv = LiveRegs[rx])) { + if (dv->collapsed()) + dv->add(domain); + else + Collapse(dv, domain); + } else { + // Set up basic collapsed DomainValue. + dv = Pool.Alloc(); + dv->Dist = Distance; + dv->add(domain); + SetLiveReg(rx, dv); + } +} + +/// Collapse open DomainValue into given domain. If there are multiple +/// registers using dv, they each get a unique collapsed DomainValue. +void SSEDomainFixPass::Collapse(DomainValue *dv, unsigned domain) { + assert(dv->compat(1u << domain) && "Cannot collapse"); + + // Collapse all the instructions. 
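// DomainValue keeps its candidate execution domains in a plain bitmask, so
// add(), compat() and firstDomain() above reduce to simple bit operations.
// A tiny worked example with made-up domain numbers (illustrative, not part
// of the patch):

DomainValue dv;                          // open value, Mask == 0
dv.add(1);                               // Mask == 0x2 (binary 010)
dv.add(2);                               // Mask == 0x6 (binary 110)
bool StillPossible = dv.compat(0x4);     // true: domain 2 is still available
unsigned Lowest = dv.firstDomain();      // 1: lowest set bit, used by Collapse()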
+ while (!dv->Instrs.empty()) { + MachineInstr *mi = dv->Instrs.back(); + TII->SetSSEDomain(mi, domain); + dv->Instrs.pop_back(); + } + dv->Mask = 1u << domain; + + // If there are multiple users, give them new, unique DomainValues. + if (LiveRegs && dv->Refs > 1) { + for (unsigned rx = 0; rx != NumRegs; ++rx) + if (LiveRegs[rx] == dv) { + DomainValue *dv2 = Pool.Alloc(); + dv2->Dist = Distance; + dv2->add(domain); + SetLiveReg(rx, dv2); + } + } +} + +/// Merge - All instructions and registers in B are moved to A, and B is +/// released. +bool SSEDomainFixPass::Merge(DomainValue *A, DomainValue *B) { + assert(!A->collapsed() && "Cannot merge into collapsed"); + assert(!B->collapsed() && "Cannot merge from collapsed"); + if (A == B) + return true; + if (!A->compat(B->Mask)) + return false; + A->Mask &= B->Mask; + A->Dist = std::max(A->Dist, B->Dist); + A->Instrs.append(B->Instrs.begin(), B->Instrs.end()); + for (unsigned rx = 0; rx != NumRegs; ++rx) + if (LiveRegs[rx] == B) + SetLiveReg(rx, A); + return true; +} + +void SSEDomainFixPass::enterBasicBlock() { + // Try to coalesce live-out registers from predecessors. + for (MachineBasicBlock::const_livein_iterator i = MBB->livein_begin(), + e = MBB->livein_end(); i != e; ++i) { + int rx = RegIndex(*i); + if (rx < 0) continue; + for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), + pe = MBB->pred_end(); pi != pe; ++pi) { + LiveOutMap::const_iterator fi = LiveOuts.find(*pi); + if (fi == LiveOuts.end()) continue; + DomainValue *pdv = fi->second[rx]; + if (!pdv) continue; + if (!LiveRegs || !LiveRegs[rx]) { + SetLiveReg(rx, pdv); + continue; + } + + // We have a live DomainValue from more than one predecessor. + if (LiveRegs[rx]->collapsed()) { + // We are already collapsed, but predecessor is not. Force him. + if (!pdv->collapsed()) + Collapse(pdv, LiveRegs[rx]->firstDomain()); + continue; + } + + // Currently open, merge in predecessor. + if (!pdv->collapsed()) + Merge(LiveRegs[rx], pdv); + else + Collapse(LiveRegs[rx], pdv->firstDomain()); + } + } +} + +// A hard instruction only works in one domain. All input registers will be +// forced into that domain. +void SSEDomainFixPass::visitHardInstr(MachineInstr *mi, unsigned domain) { + // Collapse all uses. + for (unsigned i = mi->getDesc().getNumDefs(), + e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + Force(rx, domain); + } + + // Kill all defs and force them. + for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + Kill(rx); + Force(rx, domain); + } +} + +// A soft instruction can be changed to work in other domains given by mask. +void SSEDomainFixPass::visitSoftInstr(MachineInstr *mi, unsigned mask) { + // Scan the explicit use operands for incoming domains. + unsigned collmask = mask; + SmallVector<int, 4> used; + if (LiveRegs) + for (unsigned i = mi->getDesc().getNumDefs(), + e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + if (DomainValue *dv = LiveRegs[rx]) { + // Is it possible to use this collapsed register for free? 
+ if (dv->collapsed()) { + if (unsigned m = collmask & dv->Mask) + collmask = m; + } else if (dv->compat(collmask)) + used.push_back(rx); + else + Kill(rx); + } + } + + // If the collapsed operands force a single domain, propagate the collapse. + if (isPowerOf2_32(collmask)) { + unsigned domain = CountTrailingZeros_32(collmask); + TII->SetSSEDomain(mi, domain); + visitHardInstr(mi, domain); + return; + } + + // Kill off any remaining uses that don't match collmask, and build a list of + // incoming DomainValue that we want to merge. + SmallVector<DomainValue*,4> doms; + for (SmallVector<int, 4>::iterator i=used.begin(), e=used.end(); i!=e; ++i) { + int rx = *i; + DomainValue *dv = LiveRegs[rx]; + // This useless DomainValue could have been missed above. + if (!dv->compat(collmask)) { + Kill(*i); + continue; + } + // sorted, uniqued insert. + bool inserted = false; + for (SmallVector<DomainValue*,4>::iterator i = doms.begin(), e = doms.end(); + i != e && !inserted; ++i) { + if (dv == *i) + inserted = true; + else if (dv->Dist < (*i)->Dist) { + inserted = true; + doms.insert(i, dv); + } + } + if (!inserted) + doms.push_back(dv); + } + + // doms are now sorted in order of appearance. Try to merge them all, giving + // priority to the latest ones. + DomainValue *dv = 0; + while (!doms.empty()) { + if (!dv) { + dv = doms.pop_back_val(); + continue; + } + + DomainValue *ThisDV = doms.pop_back_val(); + if (Merge(dv, ThisDV)) continue; + + for (SmallVector<int,4>::iterator i=used.begin(), e=used.end(); i != e; ++i) + if (LiveRegs[*i] == ThisDV) + Kill(*i); + } + + // dv is the DomainValue we are going to use for this instruction. + if (!dv) + dv = Pool.Alloc(); + dv->Dist = Distance; + dv->Mask = collmask; + dv->Instrs.push_back(mi); + + // Finally set all defs and non-collapsed uses to dv. + for (unsigned i = 0, e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + if (!LiveRegs || !LiveRegs[rx] || (mo.isDef() && LiveRegs[rx]!=dv)) { + Kill(rx); + SetLiveReg(rx, dv); + } + } +} + +void SSEDomainFixPass::visitGenericInstr(MachineInstr *mi) { + // Process explicit defs, kill any XMM registers redefined. + for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + int rx = RegIndex(mo.getReg()); + if (rx < 0) continue; + Kill(rx); + } +} + +bool SSEDomainFixPass::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + TII = static_cast<const X86InstrInfo*>(MF->getTarget().getInstrInfo()); + TRI = MF->getTarget().getRegisterInfo(); + MBB = 0; + LiveRegs = 0; + Distance = 0; + assert(NumRegs == X86::VR128RegClass.getNumRegs() && "Bad regclass"); + + // If no XMM registers are used in the function, we can skip it completely. 
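// visitSoftInstr() above starts from the instruction's full candidate mask and
// intersects it with every collapsed input register it sees; once only one
// domain survives, the instruction is rewritten and treated as a hard
// instruction. A worked example with made-up masks (illustrative only):
//
//   instruction supports          mask     == 0x6 (binary 0110)
//   collapsed input available in  dv->Mask == 0x4 (binary 0100)
//     -> collmask narrows to 0x4
//     -> isPowerOf2_32(collmask) holds, so domain = CountTrailingZeros_32 = 2,
//        TII->SetSSEDomain(mi, 2) runs, and the rest goes through
//        visitHardInstr().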
+ bool anyregs = false; + for (TargetRegisterClass::const_iterator I = X86::VR128RegClass.begin(), + E = X86::VR128RegClass.end(); I != E; ++I) + if (MF->getRegInfo().isPhysRegUsed(*I)) { + anyregs = true; + break; + } + if (!anyregs) return false; + + MachineBasicBlock *Entry = MF->begin(); + SmallPtrSet<MachineBasicBlock*, 16> Visited; + for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 16> > + DFI = df_ext_begin(Entry, Visited), DFE = df_ext_end(Entry, Visited); + DFI != DFE; ++DFI) { + MBB = *DFI; + enterBasicBlock(); + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + MachineInstr *mi = I; + if (mi->isDebugValue()) continue; + ++Distance; + std::pair<uint16_t, uint16_t> domp = TII->GetSSEDomain(mi); + if (domp.first) + if (domp.second) + visitSoftInstr(mi, domp.second); + else + visitHardInstr(mi, domp.first); + else if (LiveRegs) + visitGenericInstr(mi); + } + + // Save live registers at end of MBB - used by enterBasicBlock(). + if (LiveRegs) + LiveOuts.insert(std::make_pair(MBB, LiveRegs)); + LiveRegs = 0; + } + + // Clear the LiveOuts vectors. Should we also collapse any remaining + // DomainValues? + for (LiveOutMap::const_iterator i = LiveOuts.begin(), e = LiveOuts.end(); + i != e; ++i) + free(i->second); + LiveOuts.clear(); + Pool.Clear(); + + return false; +} + +FunctionPass *llvm::createSSEDomainFixPass() { + return new SSEDomainFixPass(); +} diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index c753cf2..9be38a4 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -41,6 +41,10 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM, /// FunctionPass *createX86FloatingPointStackifierPass(); +/// createSSEDomainFixPass - This pass twiddles SSE opcodes to prevent domain +/// crossings. +FunctionPass *createSSEDomainFixPass(); + /// createX87FPRegKillInserterPass - This function returns a pass which /// inserts FP_REG_KILL instructions where needed. 
/// diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 2be51e1..6b62795 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -59,6 +59,9 @@ def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", [FeatureCMOV]>; def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; +def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", + "IsUAMemFast", "true", + "Fast unaligned memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions">; @@ -98,8 +101,10 @@ def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>; -def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; +def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; // Sandy Bridge does not have FMA def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>; @@ -160,10 +165,11 @@ def X86InstrInfo : InstrInfo { "hasAdSizePrefix", "Prefix", "hasREX_WPrefix", - "ImmTypeBits", - "FPFormBits", + "ImmT.Value", + "FPForm.Value", "hasLockPrefix", "SegOvrBits", + "ExeDomain.Value", "Opcode"]; let TSFlagsShifts = [0, 6, @@ -174,6 +180,7 @@ def X86InstrInfo : InstrInfo { 16, 19, 20, + 22, 24]; } diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index 754a200..8e2928c3 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -10,10 +10,14 @@ #include "llvm/Target/TargetAsmBackend.h" #include "X86.h" #include "X86FixupKinds.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MachObjectWriter.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; @@ -48,8 +52,135 @@ public: for (unsigned i = 0; i != Size; ++i) DF.getContents()[Fixup.Offset + i] = uint8_t(Value >> (i * 8)); } + + bool MayNeedRelaxation(const MCInst &Inst, + const SmallVectorImpl<MCAsmFixup> &Fixups) const; + + void RelaxInstruction(const MCInstFragment *IF, MCInst &Res) const; + + bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; }; +static unsigned getRelaxedOpcode(unsigned Op) { + switch (Op) { + default: + return Op; + + case X86::JAE_1: return X86::JAE_4; + case X86::JA_1: return X86::JA_4; + case X86::JBE_1: return X86::JBE_4; + case X86::JB_1: return X86::JB_4; + case X86::JE_1: return X86::JE_4; + case X86::JGE_1: return X86::JGE_4; + case X86::JG_1: return X86::JG_4; + case X86::JLE_1: return X86::JLE_4; + case X86::JL_1: return X86::JL_4; + case X86::JMP_1: return X86::JMP_4; + case X86::JNE_1: return X86::JNE_4; + case X86::JNO_1: return X86::JNO_4; + case X86::JNP_1: return X86::JNP_4; + case X86::JNS_1: return X86::JNS_4; + case X86::JO_1: return X86::JO_4; + case X86::JP_1: return X86::JP_4; + case X86::JS_1: return X86::JS_4; + } +} + +bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst, + const SmallVectorImpl<MCAsmFixup> &Fixups) const { + // Check for a 
1byte pcrel fixup, and enforce that we would know how to relax + // this instruction. + for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { + if (unsigned(Fixups[i].Kind) == X86::reloc_pcrel_1byte) { + assert(getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode()); + return true; + } + } + + return false; +} + +// FIXME: Can tblgen help at all here to verify there aren't other instructions +// we can relax? +void X86AsmBackend::RelaxInstruction(const MCInstFragment *IF, + MCInst &Res) const { + // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. + unsigned RelaxedOp = getRelaxedOpcode(IF->getInst().getOpcode()); + + if (RelaxedOp == IF->getInst().getOpcode()) { + SmallString<256> Tmp; + raw_svector_ostream OS(Tmp); + IF->getInst().dump_pretty(OS); + llvm_report_error("unexpected instruction to relax: " + OS.str()); + } + + Res = IF->getInst(); + Res.setOpcode(RelaxedOp); +} + +/// WriteNopData - Write optimal nops to the output file for the \arg Count +/// bytes. This returns the number of bytes written. It may return 0 if +/// the \arg Count is more than the maximum optimal nops. +/// +/// FIXME this is X86 32-bit specific and should move to a better place. +bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { + static const uint8_t Nops[16][16] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // nopl (%[re]ax) + {0x0f, 0x1f, 0x00}, + // nopl 0(%[re]ax) + {0x0f, 0x1f, 0x40, 0x00}, + // nopl 0(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopl 0L(%[re]ax) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw 0L(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw %cs:0L(%[re]ax,%[re]ax,1) + {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopl 0(%[re]ax,%[re]ax,1) + // nopw 0(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x44, 0x00, 0x00, + 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + // nopw 0(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, + 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + // nopl 0L(%[re]ax) */ + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax) + // nopl 0L(%[re]ax) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax) + // nopl 0L(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, + 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00} + }; + + // Write an optimal sequence for the first 15 bytes. + uint64_t OptimalCount = (Count < 16) ? Count : 15; + for (uint64_t i = 0, e = OptimalCount; i != e; i++) + OW->Write8(Nops[OptimalCount - 1][i]); + + // Finish with single byte nops. + for (uint64_t i = OptimalCount, e = Count; i != e; ++i) + OW->Write8(0x90); + + return true; +} + +/* *** */ + class ELFX86AsmBackend : public X86AsmBackend { public: ELFX86AsmBackend(const Target &T) diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 5d3edbb..c69eeb3 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -383,7 +383,7 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM) { if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) { uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); // They have to fit in the 32-bit signed displacement field though. 
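// The new X86AsmBackend hooks earlier in this block pair every one-byte
// pc-relative jump with its four-byte form: MayNeedRelaxation() fires on any
// reloc_pcrel_1byte fixup, and RelaxInstruction() swaps the opcode through
// getRelaxedOpcode(). A condensed sketch of the rewrite the assembler ends up
// performing (illustrative; the MCInst calls and opcode names are the ones
// used in the hunk):

MCInst Short;
Short.setOpcode(X86::JE_1);                           // je rel8
MCInst Long = Short;                                  // operands carried over
Long.setOpcode(getRelaxedOpcode(Short.getOpcode()));  // X86::JE_4, je rel32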
- if (isInt32(Disp)) { + if (isInt<32>(Disp)) { AM.Disp = (uint32_t)Disp; return X86SelectAddress(U->getOperand(0), AM); } @@ -427,7 +427,7 @@ bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM) { } } // Check for displacement overflow. - if (!isInt32(Disp)) + if (!isInt<32>(Disp)) break; // Ok, the GEP indices were covered by constant-offset and scaled-index // addressing. Update the address state and move on to examining the base. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 704f9c6..b24d5a1 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -802,6 +802,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (!VT.is128BitVector()) { continue; } + setOperationAction(ISD::AND, SVT, Promote); AddPromotedToType (ISD::AND, SVT, MVT::v2i64); setOperationAction(ISD::OR, SVT, Promote); @@ -1008,7 +1009,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: These should be based on subtarget info. Plus, the values should // be smaller when we are in optimizing for size mode. maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores + maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores setPrefLoopAlignment(16); benefitFromCodePlacementOpt = true; @@ -1066,23 +1067,37 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { } /// getOptimalMemOpType - Returns the target specific optimal type for load -/// and store operations as a result of memset, memcpy, and memmove -/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for -/// determining it. +/// and store operations as a result of memset, memcpy, and memmove lowering. +/// If DstAlign is zero that means it's safe to destination alignment can +/// satisfy any constraint. Similarly if SrcAlign is zero it means there +/// isn't a need to check it against alignment requirement, probably because +/// the source does not need to be loaded. It returns EVT::Other if +/// SelectionDAG should be responsible for determining it. EVT -X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, +X86TargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const { // FIXME: This turns off use of xmm stores for memset/memcpy on targets like // linux. This is because the stack realignment code can't handle certain // cases like PR2962. This should be removed when PR2962 is fixed. 
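// The X86 override of getOptimalMemOpType() now folds in both alignments and
// the new fast-unaligned-access subtarget bit. Two hypothetical queries
// against the body that follows (illustrative values; assumes SSE2, a 16-byte
// aligned stack, and no noimplicitfloat attribute):
//
//   Size = 32, DstAlign = 16, SrcAlign = 0
//     -> the 16-byte vector path is taken and MVT::v4i32 is returned.
//   Size = 32, DstAlign = 4, SrcAlign = 4, subtarget without fast unaligned
//   access
//     -> the vector case is rejected; on x86-64 the fallback returns MVT::i64.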
const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) { - if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16) - return MVT::v4i32; - if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16) - return MVT::v4f32; + if (!F->hasFnAttr(Attribute::NoImplicitFloat)) { + if (Size >= 16 && + (Subtarget->isUnalignedMemAccessFast() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16))) && + Subtarget->getStackAlignment() >= 16) { + if (Subtarget->hasSSE2()) + return MVT::v4i32; + if (SafeToUseFP && Subtarget->hasSSE1()) + return MVT::v4f32; + } else if (SafeToUseFP && + Size >= 8 && + !Subtarget->is64Bit() && + Subtarget->getStackAlignment() >= 8 && + Subtarget->hasSSE2()) + return MVT::f64; } if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; @@ -1108,8 +1123,8 @@ MCSymbol * X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF, MCContext &Ctx) const { const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo(); - return Ctx.GetOrCreateTemporarySymbol(Twine(MAI.getPrivateGlobalPrefix())+ - Twine(MF->getFunctionNumber())+"$pb"); + return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+ + Twine(MF->getFunctionNumber())+"$pb"); } @@ -2290,6 +2305,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; // If -tailcallopt is specified, make fastcc functions tail-callable. + const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = DAG.getMachineFunction().getFunction(); if (GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && @@ -2301,8 +2317,14 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Look for obvious safe cases to perform tail call optimization that does not // requite ABI changes. This is what gcc calls sibcall. - // Do not sibcall optimize vararg calls for now. - if (isVarArg) + // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to + // emit a special epilogue. + if (RegInfo->needsStackRealignment(MF)) + return false; + + // Do not sibcall optimize vararg calls unless the call site is not passing any + // arguments. + if (isVarArg && !Outs.empty()) return false; // Also avoid sibcall optimization if either caller or callee uses struct @@ -2417,7 +2439,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. - if (!isInt32(Offset)) + if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra @@ -3613,6 +3635,69 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, return SDValue(); } +/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a +/// vector of type 'VT', see if the elements can be replaced by a single large +/// load which has the same value as a build_vector whose operands are 'elts'. +/// +/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a +/// +/// FIXME: we'd also like to handle the case where the last elements are zero +/// rather than undef via VZEXT_LOAD, but we do not detect that case today. +/// There's even a handy isZeroNode for that purpose. 
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, + DebugLoc &dl, SelectionDAG &DAG) { + EVT EltVT = VT.getVectorElementType(); + unsigned NumElems = Elts.size(); + + LoadSDNode *LDBase = NULL; + unsigned LastLoadedElt = -1U; + + // For each element in the initializer, see if we've found a load or an undef. + // If we don't find an initial load element, or later load elements are + // non-consecutive, bail out. + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Elts[i]; + + if (!Elt.getNode() || + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + return SDValue(); + if (!LDBase) { + if (Elt.getNode()->getOpcode() == ISD::UNDEF) + return SDValue(); + LDBase = cast<LoadSDNode>(Elt.getNode()); + LastLoadedElt = i; + continue; + } + if (Elt.getOpcode() == ISD::UNDEF) + continue; + + LoadSDNode *LD = cast<LoadSDNode>(Elt); + if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) + return SDValue(); + LastLoadedElt = i; + } + + // If we have found an entire vector of loads and undefs, then return a large + // load of the entire vector width starting at the base pointer. If we found + // consecutive loads for the low half, generate a vzext_load node. + if (LastLoadedElt == NumElems - 1) { + if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), 0); + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->getAlignment()); + } else if (NumElems == 4 && LastLoadedElt == 1) { + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); + } + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); @@ -3841,14 +3926,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } - if (Values.size() > 2) { - // If we have SSE 4.1, Expand into a number of inserts unless the number of - // values to be inserted is equal to the number of elements, in which case - // use the unpack code below in the hopes of matching the consecutive elts - // load merge pattern for shuffles. - // FIXME: We could probably just check that here directly. - if (Values.size() < NumElems && VT.getSizeInBits() == 128 && - getSubtarget()->hasSSE41()) { + if (Values.size() > 1 && VT.getSizeInBits() == 128) { + // Check for a build vector of consecutive loads. + for (unsigned i = 0; i < NumElems; ++i) + V[i] = Op.getOperand(i); + + // Check for elements which are consecutive loads. + SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); + if (LD.getNode()) + return LD; + + // For SSE 4.1, use inserts into undef. + if (getSubtarget()->hasSSE41()) { V[0] = DAG.getUNDEF(VT); for (unsigned i = 0; i < NumElems; ++i) if (Op.getOperand(i).getOpcode() != ISD::UNDEF) @@ -3856,7 +3945,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { Op.getOperand(i), DAG.getIntPtrConstant(i)); return V[0]; } - // Expand into a number of unpckl*. + + // Otherwise, expand into a number of unpckl* // e.g. 
for v4f32 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> @@ -3871,7 +3961,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { } return V[0]; } - return SDValue(); } @@ -8797,83 +8886,24 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, - EVT EltVT, LoadSDNode *&LDBase, - unsigned &LastLoadedElt, - SelectionDAG &DAG, MachineFrameInfo *MFI, - const TargetLowering &TLI) { - LDBase = NULL; - LastLoadedElt = -1U; - for (unsigned i = 0; i < NumElems; ++i) { - if (N->getMaskElt(i) < 0) { - if (!LDBase) - return false; - continue; - } - - SDValue Elt = DAG.getShuffleScalarElt(N, i); - if (!Elt.getNode() || - (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) - return false; - if (!LDBase) { - if (Elt.getNode()->getOpcode() == ISD::UNDEF) - return false; - LDBase = cast<LoadSDNode>(Elt.getNode()); - LastLoadedElt = i; - continue; - } - if (Elt.getOpcode() == ISD::UNDEF) - continue; - - LoadSDNode *LD = cast<LoadSDNode>(Elt); - if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) - return false; - LastLoadedElt = i; - } - return true; -} - /// PerformShuffleCombine - Combine a vector_shuffle that is equal to /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load /// if the load addresses are consecutive, non-overlapping, and in the right -/// order. In the case of v2i64, it will see if it can rewrite the -/// shuffle to be an appropriate build vector so it can take advantage of -// performBuildVectorCombine. +/// order. static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); - EVT EltVT = VT.getVectorElementType(); ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); - unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) return SDValue(); - // Try to combine a vector_shuffle into a 128-bit load. 
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - LoadSDNode *LD = NULL; - unsigned LastLoadedElt; - if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG, - MFI, TLI)) - return SDValue(); - - if (LastLoadedElt == NumElems - 1) { - if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16) - return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - LD->isVolatile(), LD->isNonTemporal(), 0); - return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - LD->isVolatile(), LD->isNonTemporal(), - LD->getAlignment()); - } else if (NumElems == 4 && LastLoadedElt == 1) { - SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); - SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; - SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); - } - return SDValue(); + SmallVector<SDValue, 16> Elts; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) + Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); + + return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); } /// PerformShuffleCombine - Detect vector gather/scatter index generation diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 0f15eba..4549cba 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -417,12 +417,15 @@ namespace llvm { virtual unsigned getByValTypeAlignment(const Type *Ty) const; /// getOptimalMemOpType - Returns the target specific optimal type for load - /// and store operations as a result of memset, memcpy, and memmove - /// lowering. It returns EVT::iAny if SelectionDAG should be responsible for - /// determining it. - virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr, - SelectionDAG &DAG) const; + /// and store operations as a result of memset, memcpy, and memmove lowering. + /// If DstAlign is zero that means it's safe to destination alignment can + /// satisfy any constraint. Similarly if SrcAlign is zero it means there + /// isn't a need to check it against alignment requirement, probably because + /// the source does not need to be loaded. It returns EVT::Other if + /// SelectionDAG should be responsible for determining it. + virtual EVT getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool SafeToUseFP, SelectionDAG &DAG) const; /// allowsUnalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses. of the specified type. 
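Before moving on to the .td changes: the two X86ISelLowering hunks above replace the old (Align, isSrcConst, isSrcStr) signature of getOptimalMemOpType with separate DstAlign/SrcAlign values (zero meaning "no constraint to check") plus a SafeToUseFP flag. Below is a compact restatement of the selection logic from the .cpp hunk, with plain bools standing in for the Subtarget and Function queries; it is illustrative only and not the LLVM API:

  // Illustrative paraphrase of the new heuristic; names are hypothetical.
  #include <cstdint>

  enum class WideTy { V4I32, V4F32, F64, I64, Other };

  static WideTy pickMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                              bool SafeToUseFP, bool NoImplicitFloat,
                              bool FastUnaligned, bool HasSSE1, bool HasSSE2,
                              bool Is64Bit, unsigned StackAlign) {
    // Zero alignment means the corresponding side imposes no constraint.
    bool AlignedEnough = FastUnaligned ||
                         ((DstAlign == 0 || DstAlign >= 16) &&
                          (SrcAlign == 0 || SrcAlign >= 16));
    if (!NoImplicitFloat) {
      if (Size >= 16 && AlignedEnough && StackAlign >= 16) {
        if (HasSSE2) return WideTy::V4I32;        // 16-byte integer vector ops
        if (SafeToUseFP && HasSSE1) return WideTy::V4F32;
      } else if (SafeToUseFP && Size >= 8 && !Is64Bit &&
                 StackAlign >= 8 && HasSSE2) {
        return WideTy::F64;                       // 8-byte FP moves on 32-bit
      }
    }
    if (Is64Bit && Size >= 8)
      return WideTy::I64;                         // plain 64-bit GPR moves
    return WideTy::Other;                         // defer to the generic lowering
  }

The Other case stands for whatever the function returns past the portion quoted in this hunk.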
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td index 8cbb756..eef2ca0 100644 --- a/lib/Target/X86/X86Instr64bit.td +++ b/lib/Target/X86/X86Instr64bit.td @@ -295,19 +295,17 @@ def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), let Defs = [EFLAGS] in { def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsf GR64:$src)), (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, TB; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsf (loadi64 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, TB; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsr GR64:$src)), (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, TB; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (X86bsr (loadi64 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, TB; } // Defs = [EFLAGS] // Repeat string ops @@ -508,8 +506,8 @@ let isCommutable = 1 in def ADD64rr : RI<0x01, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, GR64:$src2))]>; // These are alternate spellings for use by the disassembler, we mark them as // code gen only to ensure they aren't matched by the assembler. @@ -523,21 +521,21 @@ let isCodeGenOnly = 1 in { def ADD64ri8 : RIi8<0x83, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, i64immSExt8:$src2))]>; def ADD64ri32 : RIi32<0x81, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, i64immSExt32:$src2))]>; } // isConvertibleToThreeAddress // Register-Memory Addition def ADD64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "add{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (add GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86add_flag GR64:$src1, (load addr:$src2)))]>; } // isTwoAddress @@ -604,8 +602,8 @@ let isTwoAddress = 1 in { def SUB64rr : RI<0x29, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, GR64:$src2))]>; def SUB64rr_REV : RI<0x2B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), @@ -615,20 +613,20 @@ def SUB64rr_REV : RI<0x2B, MRMSrcReg, (outs GR64:$dst), def SUB64rm : RI<0x2B, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, (load addr:$src2)))]>; // Register-Integer Subtraction def SUB64ri8 : RIi8<0x83, MRM5r, (outs GR64:$dst), (ins GR64:$src1, 
i64i8imm:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, i64immSExt8:$src2))]>; def SUB64ri32 : RIi32<0x81, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "sub{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86sub_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def SUB64i32 : RIi32<0x2D, RawFrm, (outs), (ins i32imm:$src), @@ -716,15 +714,15 @@ let isCommutable = 1 in def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "imul{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (mul GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, GR64:$src2))]>, TB; // Register-Memory Signed Integer Multiplication def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "imul{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, TB; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, (load addr:$src2)))]>, TB; } // isTwoAddress // Suprisingly enough, these are not two address instructions! @@ -733,27 +731,27 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>; def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32 (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>; // Memory-Integer Signed Integer Multiplication def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul (load addr:$src1), - i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt8:$src2))]>; def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32 (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR64:$dst, (mul (load addr:$src1), - i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt32:$src2))]>; } // Defs = [EFLAGS] // Unsigned division / remainder @@ -787,16 +785,14 @@ def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src), "inc{q}\t$dst", - [(set GR64:$dst, (add GR64:$src, 1)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src))]>; def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", [(store (add (loadi64 addr:$dst), 1), addr:$dst), (implicit EFLAGS)]>; let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src), "dec{q}\t$dst", - [(set GR64:$dst, (add GR64:$src, -1)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, 
(X86dec_flag GR64:$src))]>; def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", [(store (add (loadi64 addr:$dst), -1), addr:$dst), (implicit EFLAGS)]>; @@ -806,23 +802,19 @@ let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in { // Can transform into LEA. def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, 1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src))]>, OpSize, Requires<[In64BitMode]>; def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, 1)), - (implicit EFLAGS)]>, + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src))]>, Requires<[In64BitMode]>; def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, -1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src))]>, OpSize, Requires<[In64BitMode]>; def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, -1)), - (implicit EFLAGS)]>, + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>, Requires<[In64BitMode]>; } // isConvertibleToThreeAddress @@ -1092,26 +1084,26 @@ let isCommutable = 1 in def AND64rr : RI<0x21, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, GR64:$src2))]>; def AND64rr_REV : RI<0x23, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", []>; def AND64rm : RI<0x23, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, (load addr:$src2)))]>; def AND64ri8 : RIi8<0x83, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, i64immSExt8:$src2))]>; def AND64ri32 : RIi32<0x81, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "and{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86and_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def AND64mr : RI<0x21, MRMDestMem, @@ -1135,26 +1127,26 @@ let isCommutable = 1 in def OR64rr : RI<0x09, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, GR64:$src2))]>; def OR64rr_REV : RI<0x0B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", []>; def OR64rm : RI<0x0B, MRMSrcMem , (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, (load addr:$src2)))]>; def OR64ri8 : RIi8<0x83, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, i64immSExt8:$src2))]>; def 
OR64ri32 : RIi32<0x81, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "or{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86or_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def OR64mr : RI<0x09, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), @@ -1178,26 +1170,26 @@ let isCommutable = 1 in def XOR64rr : RI<0x31, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, GR64:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, GR64:$src2))]>; def XOR64rr_REV : RI<0x33, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", []>; def XOR64rm : RI<0x33, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, (load addr:$src2)))]>; def XOR64ri8 : RIi8<0x83, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, i64immSExt8:$src2))]>; def XOR64ri32 : RIi32<0x81, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "xor{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2)), - (implicit EFLAGS)]>; + [(set GR64:$dst, EFLAGS, + (X86xor_flag GR64:$src1, i64immSExt32:$src2))]>; } // isTwoAddress def XOR64mr : RI<0x31, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), @@ -2181,14 +2173,11 @@ def : Pat<(store (shld (loadi64 addr:$dst), (i8 imm:$amt1), // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. 
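The comment just above states the identity these AddedComplexity patterns rely on: when the selected operands are proven to have no set bits in common, addition generates no carries, so OR and ADD compute the same value and the or node can be matched as ADD64ri8, ADD64ri32, or ADD64rr. A tiny standalone check of that identity (illustrative only):

  #include <cassert>
  #include <cstdint>

  int main() {
    // With a & b == 0, each bit position holds at most one 1, so addition
    // produces no carries and a | b equals a + b.
    uint64_t a = 0x1230;   // low nibble known zero
    uint64_t b = 0x000F;   // fits entirely in that disjoint low nibble
    assert((a & b) == 0);
    assert((a | b) == (a + b));   // both equal 0x123F
    return 0;
  }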
let AddedComplexity = 5 in { // Try this before the selecting to OR -def : Pat<(parallel (or_is_add GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR64:$src1, i64immSExt8:$src2), (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (or_is_add GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR64:$src1, i64immSExt32:$src2), (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; -def : Pat<(parallel (or_is_add GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>; } // AddedComplexity @@ -2215,136 +2204,76 @@ def : Pat<(subc GR64:$src1, imm:$src2), // EFLAGS-defining Patterns //===----------------------------------------------------------------------===// -// Register-Register Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// addition +def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(add GR64:$src1, i64immSExt8:$src2), (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(add GR64:$src1, i64immSExt32:$src2), (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), (ADD64rm GR64:$src1, addr:$src2)>; -// Register-Register Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// subtraction +def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>; - -// Register-Memory Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), (SUB64rm GR64:$src1, addr:$src2)>; - -// Register-Integer Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR64:$src1, i64immSExt8:$src2), (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR64:$src1, i64immSExt32:$src2), (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; -// Register-Register Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// Multiply +def : Pat<(mul GR64:$src1, GR64:$src2), (IMUL64rr GR64:$src1, GR64:$src2)>; - -// Register-Memory Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), (IMUL64rm GR64:$src1, addr:$src2)>; - -// Register-Integer Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR64:$src1, i64immSExt8:$src2), (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR64:$src1, i64immSExt32:$src2), (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; - -// Memory-Integer Signed Integer Multiplication with EFLAGS result -def : Pat<(parallel 
(X86smul_flag (loadi64 addr:$src1), i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; -// INC and DEC with EFLAGS result. Note that these do not set CF. -def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)), - (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; -def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)), - (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; - -def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)), - (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; -def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)), - (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; - -def : Pat<(parallel (X86inc_flag GR64:$src), (implicit EFLAGS)), - (INC64r GR64:$src)>; -def : Pat<(parallel (X86dec_flag GR64:$src), (implicit EFLAGS)), - (DEC64r GR64:$src)>; - -// Register-Register Logical Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), - (OR64rr GR64:$src1, GR64:$src2)>; +// inc/dec +def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; +def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; -// Register-Integer Logical Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +// or +def : Pat<(or GR64:$src1, GR64:$src2), + (OR64rr GR64:$src1, GR64:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt8:$src2), (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(or GR64:$src1, i64immSExt32:$src2), (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Logical Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), (OR64rm GR64:$src1, addr:$src2)>; -// Register-Register Logical XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// xor +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Logical XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR64:$src1, i64immSExt8:$src2), (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR64:$src1, i64immSExt32:$src2), (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Logical XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), (XOR64rm GR64:$src1, addr:$src2)>; -// Register-Register Logical And with EFLAGS result -def : Pat<(parallel (X86and_flag GR64:$src1, GR64:$src2), - (implicit EFLAGS)), +// and +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; - -// Register-Integer Logical And with 
EFLAGS result -def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(and GR64:$src1, i64immSExt8:$src2), (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt32:$src2), - (implicit EFLAGS)), +def : Pat<(and GR64:$src1, i64immSExt32:$src2), (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; - -// Register-Memory Logical And with EFLAGS result -def : Pat<(parallel (X86and_flag GR64:$src1, (loadi64 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), (AND64rm GR64:$src1, addr:$src2)>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index bb81cbf..d25ec26 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -68,6 +68,16 @@ def CompareFP : FPFormat<5>; def CondMovFP : FPFormat<6>; def SpecialFP : FPFormat<7>; +// Class specifying the SSE execution domain, used by the SSEDomainFix pass. +// Keep in sync with tables in X86InstrInfo.cpp. +class Domain<bits<2> val> { + bits<2> Value = val; +} +def GenericDomain : Domain<0>; +def SSEPackedSingle : Domain<1>; +def SSEPackedDouble : Domain<2>; +def SSEPackedInt : Domain<3>; + // Prefix byte classes which are used to indicate to the ad-hoc machine code // emitter that various prefix bytes are required. class OpSize { bit hasOpSizePrefix = 1; } @@ -93,7 +103,7 @@ class TA { bits<4> Prefix = 14; } class TF { bits<4> Prefix = 15; } class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, - string AsmStr> + string AsmStr, Domain d = GenericDomain> : Instruction { let Namespace = "X86"; @@ -101,7 +111,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, Format Form = f; bits<6> FormBits = Form.Value; ImmType ImmT = i; - bits<3> ImmTypeBits = ImmT.Value; dag OutOperandList = outs; dag InOperandList = ins; @@ -115,20 +124,21 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bits<4> Prefix = 0; // Which prefix byte does this inst have? bit hasREX_WPrefix = 0; // Does this inst requires the REX.W prefix? - FPFormat FPForm; // What flavor of FP instruction is this? - bits<3> FPFormBits = 0; + FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? bits<2> SegOvrBits = 0; // Segment override prefix. + Domain ExeDomain = d; } -class I<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern> - : X86Inst<o, f, NoImm, outs, ins, asm> { +class I<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, Domain d = GenericDomain> + : X86Inst<o, f, NoImm, outs, ins, asm, d> { let Pattern = pattern; let CodeSize = 3; } class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> - : X86Inst<o, f, Imm8 , outs, ins, asm> { + list<dag> pattern, Domain d = GenericDomain> + : X86Inst<o, f, Imm8, outs, ins, asm, d> { let Pattern = pattern; let CodeSize = 3; } @@ -166,7 +176,7 @@ class FPI<bits<8> o, Format F, dag outs, dag ins, string asm> // FpI_ - Floating Point Psuedo Instruction template. Not Predicated. 
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern> : X86Inst<0, Pseudo, NoImm, outs, ins, ""> { - let FPForm = fp; let FPFormBits = FPForm.Value; + let FPForm = fp; let Pattern = pattern; } @@ -196,14 +206,16 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>; -class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, +class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>; class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + Requires<[HasSSE1]>; class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + Requires<[HasSSE1]>; // SSE2 Instruction Templates: // @@ -222,10 +234,12 @@ class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + Requires<[HasSSE2]>; class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + Requires<[HasSSE2]>; // SSE3 Instruction Templates: // @@ -235,12 +249,15 @@ class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE3]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS, + Requires<[HasSSE3]>; class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE3]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD, + Requires<[HasSSE3]>; class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + Requires<[HasSSE3]>; // SSSE3 Instruction Templates: @@ -254,10 +271,12 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSSE3]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + Requires<[HasSSSE3]>; class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSSE3]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasSSSE3]>; // SSE4.1 Instruction Templates: // @@ -266,17 +285,20 @@ class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, // class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE41]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + 
Requires<[HasSSE41]>; class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE41]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasSSE41]>; // SSE4.2 Instruction Templates: // // SS428I - SSE 4.2 instructions with T8 prefix. class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE42]>; + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + Requires<[HasSSE42]>; // SS42FI - SSE 4.2 instructions with TF prefix. class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, @@ -286,7 +308,8 @@ class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, // SS42AI = SSE 4.2 instructions with TA prefix class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE42]>; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasSSE42]>; // X86-64 Instruction templates... // diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 139a905..c0c9d98 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -597,7 +597,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::PMULHUWrr, X86::PMULHUWrm, 16 }, { X86::PMULHWrr, X86::PMULHWrm, 16 }, { X86::PMULLDrr, X86::PMULLDrm, 16 }, - { X86::PMULLDrr_int, X86::PMULLDrm_int, 16 }, { X86::PMULLWrr, X86::PMULLWrm, 16 }, { X86::PMULUDQrr, X86::PMULUDQrm, 16 }, { X86::PORrr, X86::PORrm, 16 }, @@ -992,8 +991,10 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, /// a few instructions in each direction it assumes it's not safe. static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator E = MBB.end(); + // It's always safe to clobber EFLAGS at the end of a block. - if (I == MBB.end()) + if (I == E) return true; // For compile time consideration, if we are not able to determine the @@ -1017,20 +1018,28 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, // This instruction defines EFLAGS, no need to look any further. return true; ++Iter; + // Skip over DBG_VALUE. + while (Iter != E && Iter->isDebugValue()) + ++Iter; // If we make it to the end of the block, it's safe to clobber EFLAGS. - if (Iter == MBB.end()) + if (Iter == E) return true; } + MachineBasicBlock::iterator B = MBB.begin(); Iter = I; for (unsigned i = 0; i < 4; ++i) { // If we make it to the beginning of the block, it's safe to clobber // EFLAGS iff EFLAGS is not live-in. - if (Iter == MBB.begin()) + if (Iter == B) return !MBB.isLiveIn(X86::EFLAGS); --Iter; + // Skip over DBG_VALUE. + while (Iter != B && Iter->isDebugValue()) + --Iter; + bool SawKill = false; for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { MachineOperand &MO = Iter->getOperand(j); @@ -1677,6 +1686,8 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock::iterator I = MBB.end(); while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; // Working from the bottom, when we see a non-terminator instruction, we're // done. 
@@ -1773,6 +1784,8 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { while (I != MBB.begin()) { --I; + if (I->isDebugValue()) + continue; if (I->getOpcode() != X86::JMP_4 && GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; @@ -2505,7 +2518,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = (*LoadMI->memoperands_begin())->getAlignment(); else switch (LoadMI->getOpcode()) { - case X86::V_SET0: + case X86::V_SET0PS: + case X86::V_SET0PD: + case X86::V_SET0PI: case X86::V_SETALLONES: Alignment = 16; break; @@ -2535,11 +2550,13 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, SmallVector<MachineOperand,X86AddrNumOperands> MOs; switch (LoadMI->getOpcode()) { - case X86::V_SET0: + case X86::V_SET0PS: + case X86::V_SET0PD: + case X86::V_SET0PI: case X86::V_SETALLONES: case X86::FsFLD0SD: case X86::FsFLD0SS: { - // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. + // Folding a V_SET0P? or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. // Medium and large mode can't fold loads this way. @@ -3648,3 +3665,51 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { X86FI->setGlobalBaseReg(GlobalBaseReg); return GlobalBaseReg; } + +// These are the replaceable SSE instructions. Some of these have Int variants +// that we don't include here. We don't want to replace instructions selected +// by intrinsics. +static const unsigned ReplaceableInstrs[][3] = { + //PackedInt PackedSingle PackedDouble + { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, + { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, + { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, + { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, + { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, + { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, + { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, + { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, + { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm }, + { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr }, + { X86::ORPSrm, X86::ORPDrm, X86::PORrm }, + { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, + { X86::V_SET0PS, X86::V_SET0PD, X86::V_SET0PI }, + { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, + { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, +}; + +// FIXME: Some shuffle and unpack instructions have equivalents in different +// domains, but they require a bit more work than just switching opcodes. + +static const unsigned *lookup(unsigned opcode, unsigned domain) { + for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) + if (ReplaceableInstrs[i][domain-1] == opcode) + return ReplaceableInstrs[i]; + return 0; +} + +std::pair<uint16_t, uint16_t> +X86InstrInfo::GetSSEDomain(const MachineInstr *MI) const { + uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + return std::make_pair(domain, + domain && lookup(MI->getOpcode(), domain) ? 
0xe : 0); +} + +void X86InstrInfo::SetSSEDomain(MachineInstr *MI, unsigned Domain) const { + assert(Domain>0 && Domain<4 && "Invalid execution domain"); + uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + assert(dom && "Not an SSE instruction"); + const unsigned *table = lookup(MI->getOpcode(), dom); + assert(table && "Cannot change domain"); + MI->setDesc(get(table[Domain-1])); +} diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 5111719..f0bdd06 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -398,7 +398,10 @@ namespace X86II { FS = 1 << SegOvrShift, GS = 2 << SegOvrShift, - // Bits 22 -> 23 are unused + // Execution domain for SSE instructions in bits 22, 23. + // 0 in bits 22-23 means normal, non-SSE instruction. + SSEDomainShift = 22, + OpcodeShift = 24, OpcodeMask = 0xFF << OpcodeShift }; @@ -486,7 +489,7 @@ class X86InstrInfo : public TargetInstrInfoImpl { /// MemOp2RegOpTable - Load / store unfolding opcode map. /// DenseMap<unsigned*, std::pair<unsigned, unsigned> > MemOp2RegOpTable; - + public: explicit X86InstrInfo(X86TargetMachine &tm); @@ -716,6 +719,13 @@ public: /// unsigned getGlobalBaseReg(MachineFunction *MF) const; + /// GetSSEDomain - Return the SSE execution domain of MI as the first element, + /// and a bitmask of possible arguments to SetSSEDomain ase the second. + std::pair<uint16_t, uint16_t> GetSSEDomain(const MachineInstr *MI) const; + + /// SetSSEDomain - Set the SSEDomain of MI. + void SetSSEDomain(MachineInstr *MI, unsigned Domain) const; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index c80a18d..8fccc8a 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -1,4 +1,4 @@ - +//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -28,12 +28,13 @@ def SDTX86Cmov : SDTypeProfile<1, 4, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; // Unary and binary operator instructions that set EFLAGS as a side-effect. 
-def SDTUnaryArithWithFlags : SDTypeProfile<1, 1, - [SDTCisInt<0>]>; -def SDTBinaryArithWithFlags : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<0>]>; +def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, + [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTX86BrCond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; @@ -77,8 +78,8 @@ def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; -def X86bsf : SDNode<"X86ISD::BSF", SDTIntUnaryOp>; -def X86bsr : SDNode<"X86ISD::BSR", SDTIntUnaryOp>; +def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; @@ -167,6 +168,7 @@ def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86umul_flag : SDNode<"X86ISD::UMUL", SDTUnaryArithWithFlags, [SDNPCommutative]>; + def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, @@ -323,6 +325,7 @@ def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" "TM.getCodeModel() == CodeModel::Kernel">; def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; +def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">; def OptForSize : Predicate<"OptForSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; @@ -473,15 +476,14 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1))) return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); - else { - unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); - APInt Mask = APInt::getAllOnesValue(BitWidth); - APInt KnownZero0, KnownOne0; - CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0); - APInt KnownZero1, KnownOne1; - CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0); - return (~KnownZero0 & ~KnownZero1) == 0; - } + + unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); + APInt Mask = APInt::getAllOnesValue(BitWidth); + APInt KnownZero0, KnownOne0; + CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0); + APInt KnownZero1, KnownOne1; + CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0); + return (~KnownZero0 & ~KnownZero1) == 0; }]>; // 'shld' and 'shrd' instruction patterns. Note that even though these have @@ -585,7 +587,7 @@ let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in // Return instructions. 
let isTerminator = 1, isReturn = 1, isBarrier = 1, - hasCtrlDep = 1, FPForm = SpecialFP, FPFormBits = SpecialFP.Value in { + hasCtrlDep = 1, FPForm = SpecialFP in { def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), "ret", [(X86retflag 0)]>; @@ -806,33 +808,29 @@ let isTwoAddress = 1 in // GR32 = bswap GR32 let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsf GR16:$src)), (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, TB; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsf (loadi16 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, TB; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsf GR32:$src)), (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, TB; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsf (loadi32 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, TB; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsr GR16:$src)), (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, TB; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (X86bsr (loadi16 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, TB; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsr GR32:$src)), (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, TB; def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (X86bsr (loadi32 addr:$src))), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, TB; } // Defs = [EFLAGS] let neverHasSideEffects = 1 in @@ -1697,18 +1695,17 @@ let isTwoAddress = 0 in { let Defs = [EFLAGS] in { let CodeSize = 2 in def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src), "inc{b}\t$dst", - [(set GR8:$dst, (add GR8:$src, 1)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src))]>; + let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. 
def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "inc{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, 1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src))]>, OpSize, Requires<[In32BitMode]>; def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "inc{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, 1)), - (implicit EFLAGS)]>, Requires<[In32BitMode]>; + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src))]>, + Requires<[In32BitMode]>; } let isTwoAddress = 0, CodeSize = 2 in { def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst", @@ -1726,18 +1723,16 @@ let isTwoAddress = 0, CodeSize = 2 in { let CodeSize = 2 in def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src), "dec{b}\t$dst", - [(set GR8:$dst, (add GR8:$src, -1)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src))]>; let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), "dec{w}\t$dst", - [(set GR16:$dst, (add GR16:$src, -1)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src))]>, OpSize, Requires<[In32BitMode]>; def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "dec{l}\t$dst", - [(set GR32:$dst, (add GR32:$src, -1)), - (implicit EFLAGS)]>, Requires<[In32BitMode]>; + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>, + Requires<[In32BitMode]>; } let isTwoAddress = 0, CodeSize = 2 in { @@ -1758,21 +1753,20 @@ let isTwoAddress = 0, CodeSize = 2 in { // Logical operators... let Defs = [EFLAGS] in { let isCommutable = 1 in { // X = AND Y, Z --> X = AND Z, Y -def AND8rr : I<0x20, MRMDestReg, - (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), - "and{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (and GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; -def AND16rr : I<0x21, MRMDestReg, - (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; -def AND32rr : I<0x21, MRMDestReg, - (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; +def AND8rr : I<0x20, MRMDestReg, + (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), + "and{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, GR8:$src2))]>; +def AND16rr : I<0x21, MRMDestReg, + (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "and{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + GR16:$src2))]>, OpSize; +def AND32rr : I<0x21, MRMDestReg, + (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "and{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + GR32:$src2))]>; } // AND instructions with the destination register in REG and the source register @@ -1789,45 +1783,46 @@ def AND32rr_REV : I<0x23, MRMSrcReg, (outs GR32:$dst), def AND8rm : I<0x22, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "and{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (and GR8:$src1, (loadi8 addr:$src2))), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, + (loadi8 addr:$src2)))]>; def AND16rm : I<0x23, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, (loadi16 addr:$src2))), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + (loadi16 
addr:$src2)))]>, + OpSize; def AND32rm : I<0x23, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, (loadi32 addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + (loadi32 addr:$src2)))]>; def AND8ri : Ii8<0x80, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm :$src2), "and{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (and GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, + imm:$src2))]>; def AND16ri : Ii16<0x81, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + imm:$src2))]>, OpSize; def AND32ri : Ii32<0x81, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + imm:$src2))]>; def AND16ri8 : Ii8<0x83, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "and{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1, + i16immSExt8:$src2))]>, OpSize; def AND32ri8 : Ii8<0x83, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "and{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1, + i32immSExt8:$src2))]>; let isTwoAddress = 0 in { def AND8mr : I<0x20, MRMDestMem, @@ -1888,18 +1883,16 @@ let isCommutable = 1 in { // X = OR Y, Z --> X = OR Z, Y def OR8rr : I<0x08, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), "or{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (or GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86or_flag GR8:$src1, GR8:$src2))]>; def OR16rr : I<0x09, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1,GR16:$src2))]>, + OpSize; def OR32rr : I<0x09, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1,GR32:$src2))]>; } // OR instructions with the destination register in REG and the source register @@ -1913,48 +1906,48 @@ def OR32rr_REV : I<0x0B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", []>; -def OR8rm : I<0x0A, MRMSrcMem , (outs GR8 :$dst), +def OR8rm : I<0x0A, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "or{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (or GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; -def OR16rm : I<0x0B, MRMSrcMem , (outs GR16:$dst), + [(set GR8:$dst, EFLAGS, (X86or_flag GR8:$src1, + (load addr:$src2)))]>; +def OR16rm : I<0x0B, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, OpSize; -def OR32rm : I<0x0B, MRMSrcMem , (outs GR32:$dst), + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1, + (load addr:$src2)))]>, + OpSize; +def OR32rm : I<0x0B, 
MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1, + (load addr:$src2)))]>; def OR8ri : Ii8 <0x80, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), "or{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (or GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst,EFLAGS, (X86or_flag GR8:$src1, imm:$src2))]>; def OR16ri : Ii16<0x81, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1, + imm:$src2))]>, OpSize; def OR32ri : Ii32<0x81, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1, + imm:$src2))]>; def OR16ri8 : Ii8<0x83, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "or{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1, + i16immSExt8:$src2))]>, OpSize; def OR32ri8 : Ii8<0x83, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "or{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1, + i32immSExt8:$src2))]>; let isTwoAddress = 0 in { def OR8mr : I<0x08, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), "or{b}\t{$src, $dst|$dst, $src}", @@ -2004,18 +1997,18 @@ let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y def XOR8rr : I<0x30, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), "xor{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (xor GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, + GR8:$src2))]>; def XOR16rr : I<0x31, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + GR16:$src2))]>, OpSize; def XOR32rr : I<0x31, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + GR32:$src2))]>; } // isCommutable = 1 // XOR instructions with the destination register in REG and the source register @@ -2029,49 +2022,48 @@ def XOR32rr_REV : I<0x33, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", []>; -def XOR8rm : I<0x32, MRMSrcMem , +def XOR8rm : I<0x32, MRMSrcMem, (outs GR8 :$dst), (ins GR8:$src1, i8mem :$src2), "xor{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; -def XOR16rm : I<0x33, MRMSrcMem , + [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, + (load addr:$src2)))]>; +def XOR16rm : I<0x33, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + (load addr:$src2)))]>, OpSize; -def XOR32rm : I<0x33, MRMSrcMem , +def XOR32rm : I<0x33, MRMSrcMem, (outs 
GR32:$dst), (ins GR32:$src1, i32mem:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; - -def XOR8ri : Ii8<0x80, MRM6r, - (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), - "xor{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (xor GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; -def XOR16ri : Ii16<0x81, MRM6r, - (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), - "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + (load addr:$src2)))]>; + +def XOR8ri : Ii8<0x80, MRM6r, + (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), + "xor{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, imm:$src2))]>; +def XOR16ri : Ii16<0x81, MRM6r, + (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "xor{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + imm:$src2))]>, OpSize; def XOR32ri : Ii32<0x81, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + imm:$src2))]>; def XOR16ri8 : Ii8<0x83, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "xor{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, + [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1, + i16immSExt8:$src2))]>, OpSize; def XOR32ri8 : Ii8<0x83, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "xor{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1, + i32immSExt8:$src2))]>; let isTwoAddress = 0 in { def XOR8mr : I<0x30, MRMDestMem, @@ -2118,12 +2110,12 @@ let isTwoAddress = 0 in { [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst), (implicit EFLAGS)]>; - def XOR8i8 : Ii8 <0x34, RawFrm, (outs), (ins i8imm:$src), - "xor{b}\t{$src, %al|%al, $src}", []>; - def XOR16i16 : Ii16 <0x35, RawFrm, (outs), (ins i16imm:$src), - "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize; - def XOR32i32 : Ii32 <0x35, RawFrm, (outs), (ins i32imm:$src), - "xor{l}\t{$src, %eax|%eax, $src}", []>; + def XOR8i8 : Ii8 <0x34, RawFrm, (outs), (ins i8imm:$src), + "xor{b}\t{$src, %al|%al, $src}", []>; + def XOR16i16 : Ii16<0x35, RawFrm, (outs), (ins i16imm:$src), + "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize; + def XOR32i32 : Ii32<0x35, RawFrm, (outs), (ins i32imm:$src), + "xor{l}\t{$src, %eax|%eax, $src}", []>; } // isTwoAddress = 0 } // Defs = [EFLAGS] @@ -2690,21 +2682,20 @@ let isCommutable = 1 in { // X = ADD Y, Z --> X = ADD Z, Y def ADD8rr : I<0x00, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2), "add{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (add GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86add_flag GR8:$src1, GR8:$src2))]>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
// Register-Register Addition def ADD16rr : I<0x01, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86add_flag GR16:$src1, + GR16:$src2))]>, OpSize; def ADD32rr : I<0x01, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86add_flag GR32:$src1, + GR32:$src2))]>; } // end isConvertibleToThreeAddress } // end isCommutable @@ -2723,47 +2714,47 @@ let isCodeGenOnly = 1 in { def ADD8rm : I<0x02, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "add{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (add GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, (X86add_flag GR8:$src1, + (load addr:$src2)))]>; def ADD16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, (X86add_flag GR16:$src1, + (load addr:$src2)))]>, OpSize; def ADD32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, (X86add_flag GR32:$src1, + (load addr:$src2)))]>; // Register-Integer Addition def ADD8ri : Ii8<0x80, MRM0r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), "add{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (add GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86add_flag GR8:$src1, imm:$src2))]>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
// Register-Integer Addition def ADD16ri : Ii16<0x81, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86add_flag GR16:$src1, imm:$src2))]>, OpSize; def ADD32ri : Ii32<0x81, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86add_flag GR32:$src1, imm:$src2))]>; def ADD16ri8 : Ii8<0x83, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "add{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86add_flag GR16:$src1, i16immSExt8:$src2))]>, OpSize; def ADD32ri8 : Ii8<0x83, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "add{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86add_flag GR32:$src1, i32immSExt8:$src2))]>; } let isTwoAddress = 0 in { @@ -2911,16 +2902,16 @@ let isTwoAddress = 0 in { // Register-Register Subtraction def SUB8rr : I<0x28, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sub GR8:$src1, GR8:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86sub_flag GR8:$src1, GR8:$src2))]>; def SUB16rr : I<0x29, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, GR16:$src2))]>, OpSize; def SUB32rr : I<0x29, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, GR32:$src2))]>; def SUB8rr_REV : I<0x2A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", []>; @@ -2935,45 +2926,45 @@ def SUB32rr_REV : I<0x2B, MRMSrcReg, (outs GR32:$dst), def SUB8rm : I<0x2A, MRMSrcMem, (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86sub_flag GR8:$src1, (load addr:$src2)))]>; def SUB16rm : I<0x2B, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, (load addr:$src2)))]>, OpSize; def SUB32rm : I<0x2B, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, (load addr:$src2)))]>; // Register-Integer Subtraction def SUB8ri : Ii8 <0x80, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), "sub{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sub GR8:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR8:$dst, EFLAGS, + (X86sub_flag GR8:$src1, imm:$src2))]>; def SUB16ri : Ii16<0x81, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, imm:$src2)), - (implicit 
EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, imm:$src2))]>, OpSize; def SUB32ri : Ii32<0x81, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, imm:$src2))]>; def SUB16ri8 : Ii8<0x83, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "sub{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86sub_flag GR16:$src1, i16immSExt8:$src2))]>, OpSize; def SUB32ri8 : Ii8<0x83, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "sub{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86sub_flag GR32:$src1, i32immSExt8:$src2))]>; let isTwoAddress = 0 in { // Memory-Register Subtraction @@ -3122,25 +3113,26 @@ let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y // Register-Register Signed Integer Multiply def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), "imul{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (mul GR16:$src1, GR16:$src2)), - (implicit EFLAGS)]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, GR16:$src2))]>, TB, OpSize; def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), "imul{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (mul GR32:$src1, GR32:$src2)), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, GR32:$src2))]>, TB; } // Register-Memory Signed Integer Multiply def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "imul{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, TB, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, (load addr:$src2)))]>, + TB, OpSize; def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "imul{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2))), - (implicit EFLAGS)]>, TB; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, (load addr:$src2)))]>, TB; } // Defs = [EFLAGS] } // end Two Address instructions @@ -3150,47 +3142,49 @@ let Defs = [EFLAGS] in { def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul GR16:$src1, imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))]>, OpSize; def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul GR32:$src1, imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, imm:$src2))]>; def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>, + OpSize; def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul 
GR32:$src1, i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>; // Memory-Integer Signed Integer Multiply def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul (load addr:$src1), imm:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))]>, + OpSize; def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul (load addr:$src1), imm:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))]>; def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR16:$dst, (mul (load addr:$src1), - i16immSExt8:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i16immSExt8:$src2))]>, OpSize; def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (mul (load addr:$src1), - i32immSExt8:$src2)), - (implicit EFLAGS)]>; + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i32immSExt8:$src2))]>; } // Defs = [EFLAGS] //===----------------------------------------------------------------------===// @@ -4345,9 +4339,12 @@ def : Pat<(X86tcret GR32_TC:$dst, imm:$off), (TCRETURNri GR32_TC:$dst, imm:$off)>, Requires<[In32BitMode]>; +// FIXME: This is disabled for 32-bit PIC mode because the global base +// register which is part of the address mode may be assigned a +// callee-saved register. def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[In32BitMode]>; + Requires<[In32BitMode, IsNotPIC]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi texternalsym:$dst, imm:$off)>, @@ -4722,23 +4719,17 @@ def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. 
let AddedComplexity = 5 in { // Try this before the selecting to OR -def : Pat<(parallel (or_is_add GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (or_is_add GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (or_is_add GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR16:$src1, i16immSExt8:$src2), (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (or_is_add GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR32:$src1, i32immSExt8:$src2), (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; -def : Pat<(parallel (or_is_add GR16:$src1, GR16:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (or_is_add GR32:$src1, GR32:$src2), - (implicit EFLAGS)), +def : Pat<(or_is_add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; } // AddedComplexity @@ -4746,270 +4737,173 @@ def : Pat<(parallel (or_is_add GR32:$src1, GR32:$src2), // EFLAGS-defining Patterns //===----------------------------------------------------------------------===// -// Register-Register Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (ADD8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (ADD16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (ADD32rr GR32:$src1, GR32:$src2)>; +// add reg, reg +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// add reg, mem +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)), (ADD8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)), (ADD16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)), (ADD32rm GR32:$src1, addr:$src2)>; -// Register-Integer Addition with EFLAGS result -def : Pat<(parallel (X86add_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), - (ADD8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), - (ADD16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), - (ADD32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86add_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +// add reg, imm +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(add GR16:$src1, i16immSExt8:$src2), (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86add_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(add GR32:$src1, i32immSExt8:$src2), (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register Subtraction with EFLAGS 
result -def : Pat<(parallel (X86sub_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (SUB8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (SUB16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (SUB32rr GR32:$src1, GR32:$src2)>; +// sub reg, reg +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// sub reg, mem +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), (SUB8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), (SUB16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), (SUB32rm GR32:$src1, addr:$src2)>; -// Register-Integer Subtraction with EFLAGS result -def : Pat<(parallel (X86sub_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), +// sub reg, imm +def : Pat<(sub GR8:$src1, imm:$src2), (SUB8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR16:$src1, imm:$src2), (SUB16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR32:$src1, imm:$src2), (SUB32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86sub_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR16:$src1, i16immSExt8:$src2), (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86sub_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(sub GR32:$src1, i32immSExt8:$src2), (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), +// mul reg, reg +def : Pat<(mul GR16:$src1, GR16:$src2), (IMUL16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, GR32:$src2), (IMUL32rr GR32:$src1, GR32:$src2)>; -// Register-Memory Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +// mul reg, mem +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), (IMUL16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), (IMUL32rm GR32:$src1, addr:$src2)>; -// Register-Integer Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +// mul reg, imm +def : Pat<(mul GR16:$src1, imm:$src2), (IMUL16rri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, imm:$src2), (IMUL32rri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR16:$src1, i16immSExt8:$src2), (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag GR32:$src1, 
i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul GR32:$src1, i32immSExt8:$src2), (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; -// Memory-Integer Signed Integer Multiply with EFLAGS result -def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), imm:$src2), - (implicit EFLAGS)), +// reg = mul mem, imm +def : Pat<(mul (loadi16 addr:$src1), imm:$src2), (IMUL16rmi addr:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), imm:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi32 addr:$src1), imm:$src2), (IMUL32rmi addr:$src1, imm:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; // Optimize multiply by 2 with EFLAGS result. let AddedComplexity = 2 in { -def : Pat<(parallel (X86smul_flag GR16:$src1, 2), - (implicit EFLAGS)), - (ADD16rr GR16:$src1, GR16:$src1)>; - -def : Pat<(parallel (X86smul_flag GR32:$src1, 2), - (implicit EFLAGS)), - (ADD32rr GR32:$src1, GR32:$src1)>; +def : Pat<(X86smul_flag GR16:$src1, 2), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>; } -// INC and DEC with EFLAGS result. Note that these do not set CF. -def : Pat<(parallel (X86inc_flag GR8:$src), (implicit EFLAGS)), - (INC8r GR8:$src)>; -def : Pat<(parallel (X86dec_flag GR8:$src), (implicit EFLAGS)), - (DEC8r GR8:$src)>; - -def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)), - (INC16r GR16:$src)>, Requires<[In32BitMode]>; -def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)), - (DEC16r GR16:$src)>, Requires<[In32BitMode]>; - -def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)), - (INC32r GR32:$src)>, Requires<[In32BitMode]>; -def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)), - (DEC32r GR32:$src)>, Requires<[In32BitMode]>; - -// Register-Register Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (OR8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (OR16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (OR32rr GR32:$src1, GR32:$src2)>; - -// Register-Memory Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// Patterns for nodes that do not produce flags, for instructions that do. + +// Increment reg. +def : Pat<(add GR8:$src , 1), (INC8r GR8:$src)>; +def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[In32BitMode]>; + +// Decrement reg. +def : Pat<(add GR8:$src , -1), (DEC8r GR8:$src)>; +def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[In32BitMode]>; + +// or reg/reg. 
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; + +// or reg/mem +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), (OR8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), (OR16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), (OR32rm GR32:$src1, addr:$src2)>; -// Register-Integer Or with EFLAGS result -def : Pat<(parallel (X86or_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), - (OR8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), - (OR16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), - (OR32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86or_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +// or reg/imm +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>; +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, i16immSExt8:$src2), (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86or_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(or GR32:$src1, i32immSExt8:$src2), (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (XOR8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (XOR16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86xor_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (XOR32rr GR32:$src1, GR32:$src2)>; - -// Register-Memory XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// xor reg/reg +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; + +// xor reg/mem +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), (XOR8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), (XOR16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86xor_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), (XOR32rm GR32:$src1, addr:$src2)>; -// Register-Integer XOr with EFLAGS result -def : Pat<(parallel (X86xor_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), +// xor reg/imm +def : Pat<(xor GR8:$src1, imm:$src2), (XOR8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR16:$src1, imm:$src2), (XOR16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86xor_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR32:$src1, imm:$src2), (XOR32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86xor_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR16:$src1, i16immSExt8:$src2), (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; 
-def : Pat<(parallel (X86xor_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(xor GR32:$src1, i32immSExt8:$src2), (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; -// Register-Register And with EFLAGS result -def : Pat<(parallel (X86and_flag GR8:$src1, GR8:$src2), - (implicit EFLAGS)), - (AND8rr GR8:$src1, GR8:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, GR16:$src2), - (implicit EFLAGS)), - (AND16rr GR16:$src1, GR16:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, GR32:$src2), - (implicit EFLAGS)), - (AND32rr GR32:$src1, GR32:$src2)>; - -// Register-Memory And with EFLAGS result -def : Pat<(parallel (X86and_flag GR8:$src1, (loadi8 addr:$src2)), - (implicit EFLAGS)), +// and reg/reg +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; + +// and reg/mem +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), (AND8rm GR8:$src1, addr:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, (loadi16 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), (AND16rm GR16:$src1, addr:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, (loadi32 addr:$src2)), - (implicit EFLAGS)), +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), (AND32rm GR32:$src1, addr:$src2)>; -// Register-Integer And with EFLAGS result -def : Pat<(parallel (X86and_flag GR8:$src1, imm:$src2), - (implicit EFLAGS)), +// and reg/imm +def : Pat<(and GR8:$src1, imm:$src2), (AND8ri GR8:$src1, imm:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(and GR16:$src1, imm:$src2), (AND16ri GR16:$src1, imm:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, imm:$src2), - (implicit EFLAGS)), +def : Pat<(and GR32:$src1, imm:$src2), (AND32ri GR32:$src1, imm:$src2)>; -def : Pat<(parallel (X86and_flag GR16:$src1, i16immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(and GR16:$src1, i16immSExt8:$src2), (AND16ri8 GR16:$src1, i16immSExt8:$src2)>; -def : Pat<(parallel (X86and_flag GR32:$src1, i32immSExt8:$src2), - (implicit EFLAGS)), +def : Pat<(and GR32:$src1, i32immSExt8:$src2), (AND32ri8 GR32:$src1, i32immSExt8:$src2)>; // -disable-16bit support. 
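
The X86InstrInfo.td hunks above replace the old two-entry idiom, (parallel (op ...), (implicit EFLAGS)), with patterns whose node (X86add_flag, X86sub_flag, X86and_flag, X86or_flag, X86xor_flag, X86smul_flag) produces two results, the integer value and EFLAGS, while plain add/sub/and/or/xor/mul patterns are kept separately for the flag-free case. The following fragment is an illustration only: the function is hypothetical, and the comments describe what a selector can do with such patterns on x86, not guaranteed output of any particular compiler version.

// Illustrative sketch. x86 arithmetic/logical instructions set EFLAGS
// themselves, so when a result is immediately tested against zero the flags
// from the defining instruction can be reused instead of emitting a separate
// TEST/CMP. The X86*_flag patterns model exactly that: one node, two results.
int sum_is_nonzero(int a, int b) {
  int s = a + b;   // an ADD32rr-style instruction defines both s and EFLAGS
  if (s != 0)      // may be selected as JNE on the ADD's flags, with no TEST
    return s;
  return 0;
}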
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index e1203e2..1c81c5e 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -605,22 +605,9 @@ let AddedComplexity = 10 in { def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))), VR64:$src2)), (MMX_PANDNrr VR64:$src1, VR64:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))), - VR64:$src2)), - (MMX_PANDNrr VR64:$src1, VR64:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))), - VR64:$src2)), - (MMX_PANDNrr VR64:$src1, VR64:$src2)>; - def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))), (load addr:$src2))), (MMX_PANDNrm VR64:$src1, addr:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))), - (load addr:$src2))), - (MMX_PANDNrm VR64:$src1, addr:$src2)>; -def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV_bc))), - (load addr:$src2))), - (MMX_PANDNrm VR64:$src1, addr:$src2)>; // Move MMX to lower 64-bit of XMM def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v8i8 VR64:$src))))), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 720b663..dadc2a6 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -69,8 +69,9 @@ def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>; def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>; def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>; -def SDTX86CmpPTest : SDTypeProfile<0, 2, [SDTCisVT<0, v4f32>, - SDTCisVT<1, v4f32>]>; +def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, v4f32>, + SDTCisVT<2, v4f32>]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; //===----------------------------------------------------------------------===// @@ -1114,15 +1115,19 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), // load of an all-zeros value if folding it would be beneficial. // FIXME: Change encoding to pseudo! 
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1 in -def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + isCodeGenOnly = 1 in { +def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; +def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v2f64 immAllZerosV))]>; +let ExeDomain = SSEPackedInt in +def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllZerosV))]>; +} -def : Pat<(v2i64 immAllZerosV), (V_SET0)>; -def : Pat<(v8i16 immAllZerosV), (V_SET0)>; -def : Pat<(v16i8 immAllZerosV), (V_SET0)>; -def : Pat<(v2f64 immAllZerosV), (V_SET0)>; -def : Pat<(v4f32 immAllZerosV), (V_SET0)>; +def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>; +def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>; def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss))>; @@ -1935,6 +1940,7 @@ let Constraints = "$src1 = $dst" in { //===---------------------------------------------------------------------===// // SSE integer instructions +let ExeDomain = SSEPackedInt in { // Move Instructions let neverHasSideEffects = 1 in @@ -2043,6 +2049,7 @@ multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, } } // Constraints = "$src1 = $dst" +} // ExeDomain = SSEPackedInt // 128-bit Integer Arithmetic @@ -2105,7 +2112,8 @@ defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d, int_x86_sse2_psrai_d>; // 128-bit logical shifts. -let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in { +let Constraints = "$src1 = $dst", neverHasSideEffects = 1, + ExeDomain = SSEPackedInt in { def PSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), "pslldq\t{$src2, $dst|$dst, $src2}", []>; @@ -2139,7 +2147,7 @@ defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>; defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>; defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>; -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", ExeDomain = SSEPackedInt in { def PANDNrr : PDI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "pandn\t{$src2, $dst|$dst, $src2}", @@ -2193,6 +2201,8 @@ defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>; defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>; defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>; +let ExeDomain = SSEPackedInt in { + // Shuffle and unpack instructions let AddedComplexity = 5 in { def PSHUFDri : PDIi8<0x70, MRMSrcReg, @@ -2369,10 +2379,13 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; +} // ExeDomain = SSEPackedInt + // Non-temporal stores def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movntpd\t{$src, $dst|$dst, $src}", [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; +let ExeDomain = SSEPackedInt in def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; @@ -2386,6 +2399,7 @@ def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntpd\t{$src, $dst|$dst, $src}", [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 
+let ExeDomain = SSEPackedInt in def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; @@ -2414,7 +2428,7 @@ def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1 in + isCodeGenOnly = 1, ExeDomain = SSEPackedInt in // FIXME: Change encoding to pseudo. def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; @@ -3016,14 +3030,14 @@ let Predicates = [HasSSE2] in { let AddedComplexity = 15 in { // Zeroing a VR128 then do a MOVS{S|D} to the lower bits. def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVSSrr (v4f32 (V_SET0)), + (MOVSSrr (v4f32 (V_SET0PS)), (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), x86_subreg_ss)))>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVSSrr (v4i32 (V_SET0)), + (MOVSSrr (v4i32 (V_SET0PI)), (EXTRACT_SUBREG (v4i32 VR128:$src), x86_subreg_ss))>; } @@ -3181,9 +3195,6 @@ def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))), (SHUFFLE_get_shuf_imm VR128:$src3))>; // Set lowest element and zero upper elements. -let AddedComplexity = 15 in -def : Pat<(v2f64 (movl immAllZerosV_bc, VR128:$src)), - (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>; @@ -3441,8 +3452,28 @@ let Constraints = "$src1 = $dst" in { OpSize; } } -defm PMULLD : SS41I_binop_patint<0x40, "pmulld", v4i32, mul, - int_x86_sse41_pmulld, 1>; + +/// SS48I_binop_rm - Simple SSE41 binary operator. 
+let Constraints = "$src1 = $dst" in { +multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, bit Commutable = 0> { + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>, + OpSize { + let isCommutable = Commutable; + } + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpNode VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2))))]>, + OpSize; +} +} + +defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, 1>; /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate let Constraints = "$src1 = $dst" in { @@ -3772,12 +3803,12 @@ def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), let Defs = [EFLAGS] in { def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "ptest \t{$src2, $src1|$src1, $src2}", - [(X86ptest VR128:$src1, VR128:$src2), - (implicit EFLAGS)]>, OpSize; + [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>, + OpSize; def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2), "ptest \t{$src2, $src1|$src1, $src2}", - [(X86ptest VR128:$src1, (load addr:$src2)), - (implicit EFLAGS)]>, OpSize; + [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>, + OpSize; } def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), @@ -3817,6 +3848,53 @@ def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), (PCMPGTQrm VR128:$src1, addr:$src2)>; +// TODO: These should be AES as a feature set. +defm AESIMC : SS42I_binop_rm_int<0xDB, "aesimc", + int_x86_aesni_aesimc>; +defm AESENC : SS42I_binop_rm_int<0xDC, "aesenc", + int_x86_aesni_aesenc>; +defm AESENCLAST : SS42I_binop_rm_int<0xDD, "aesenclast", + int_x86_aesni_aesenclast>; +defm AESDEC : SS42I_binop_rm_int<0xDE, "aesdec", + int_x86_aesni_aesdec>; +defm AESDECLAST : SS42I_binop_rm_int<0xDF, "aesdeclast", + int_x86_aesni_aesdeclast>; + +def : Pat<(v2i64 (int_x86_aesni_aesimc VR128:$src1, VR128:$src2)), + (AESIMCrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesimc VR128:$src1, (memop addr:$src2))), + (AESIMCrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)), + (AESENCrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))), + (AESENCrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)), + (AESENCLASTrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))), + (AESENCLASTrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)), + (AESDECrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))), + (AESDECrm VR128:$src1, addr:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)), + (AESDECLASTrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), + (AESDECLASTrm VR128:$src1, addr:$src2)>; + +def AESKEYGENASSIST128rr : SS42AI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist 
VR128:$src1, imm:$src2))]>, + OpSize; +def AESKEYGENASSIST128rm : SS42AI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i32i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)), + imm:$src2))]>, + OpSize; + // crc intrinsic instruction // This set of instructions are only rm, the only difference is the size // of r and m. diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index cd56816..8a0cde4 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -266,6 +266,9 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { unsigned Model = 0; DetectFamilyModel(EAX, Family, Model); IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13); + // If it's Nehalem, unaligned memory access is fast. + if (Family == 15 && Model == 26) + IsUAMemFast = true; GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); HasX86_64 = (EDX >> 29) & 0x1; @@ -286,6 +289,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, , HasFMA3(false) , HasFMA4(false) , IsBTMemSlow(false) + , IsUAMemFast(false) , HasVectorUAMem(false) , DarwinVers(0) , stackAlignment(8) diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 56220db..bf30154 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -78,6 +78,9 @@ protected: /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; + /// IsUAMemFast - True if unaligned memory access is fast. + bool IsUAMemFast; + /// HasVectorUAMem - True if SIMD operations can have unaligned memory /// operands. This may require setting a feature bit in the /// processor. @@ -148,6 +151,7 @@ public: bool hasFMA3() const { return HasFMA3; } bool hasFMA4() const { return HasFMA4; } bool isBTMemSlow() const { return IsBTMemSlow; } + bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } bool isTargetDarwin() const { return TargetType == isDarwin; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index f13e6f3..c608e56 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -17,6 +17,7 @@ #include "llvm/PassManager.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegistry.h" @@ -169,6 +170,15 @@ bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM, return true; // -print-machineinstr should print after this. 
} +bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (OptLevel != CodeGenOpt::None && Subtarget.hasSSE2()) { + PM.add(createSSEDomainFixPass()); + return true; + } + return false; +} + bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE) { diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 2bb5454..ae7b5b2 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -66,6 +66,7 @@ public: virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE); }; diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index e5f5a6d..54df33c 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -215,7 +215,15 @@ XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) return false; // Get the last instruction in the block. @@ -326,6 +334,11 @@ XCoreInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) return 0; --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode())) return 0; diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index 2e9a1e5..dd3cbc1 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -32,7 +32,7 @@ def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink, [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag, SDNPVariadic]>; -def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTNone, +def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind, [SDNPHasChain, SDNPOptInFlag]>; def SDT_XCoreBR_JT : SDTypeProfile<0, 2, |
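
The XCoreInstrInfo.cpp hunks above teach AnalyzeBranch and RemoveBranch to step over DBG_VALUE instructions when scanning a block backwards from its end, so that building with debug info does not perturb branch analysis or branch removal. Below is a minimal sketch of that backward-scan idiom, written as a free helper for clarity; the helper name is invented here, but the iterator walk and the isDebugValue() check mirror the code in the patch.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Return an iterator to the last non-debug instruction in MBB, or MBB.end()
// if the block is empty or contains only DBG_VALUE pseudo-instructions.
static MachineBasicBlock::iterator lastRealInstruction(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin())
    return MBB.end();                 // empty block: nothing to analyze
  --I;
  while (I->isDebugValue()) {         // DBG_VALUE carries no machine semantics
    if (I == MBB.begin())
      return MBB.end();               // only debug values in this block
    --I;
  }
  return I;                           // last instruction that affects codegen
}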